{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import scipy.io as sio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "13419\n"
     ]
    }
   ],
   "source": [
    "f = open(\"PE_eventIndex.csv\", 'rb') #读train和test中出现的event\n",
    "event_ids = f.readline().decode().strip().split(\",\") #仅读第一行，event_id,第一个元素是空字符串\n",
    "f.close()\n",
    "print(len(event_ids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "#在event.csv中筛选出相应的event记录\n",
    "data = []\n",
    "f = open(\"events.csv\", \"rb\")\n",
    "for line in f:\n",
    "    elements = line.decode().strip().split(\",\")\n",
    "    event_id = elements[0].strip()\n",
    "    if event_id in event_ids:\n",
    "        data.append(elements)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 246,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>100</th>\n",
       "      <th>101</th>\n",
       "      <th>102</th>\n",
       "      <th>103</th>\n",
       "      <th>104</th>\n",
       "      <th>105</th>\n",
       "      <th>106</th>\n",
       "      <th>107</th>\n",
       "      <th>108</th>\n",
       "      <th>109</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>684921758</td>\n",
       "      <td>3647864012</td>\n",
       "      <td>2012-10-31T00:00:00.001Z</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>244999119</td>\n",
       "      <td>3476440521</td>\n",
       "      <td>2012-11-03T00:00:00.001Z</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3928440935</td>\n",
       "      <td>517514445</td>\n",
       "      <td>2012-11-05T00:00:00.001Z</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2582345152</td>\n",
       "      <td>781585781</td>\n",
       "      <td>2012-10-30T00:00:00.001Z</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1051165850</td>\n",
       "      <td>1016098580</td>\n",
       "      <td>2012-09-27T00:00:00.001Z</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 110 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          0           1                         2   3   4   5   6   7   8    \\\n",
       "0   684921758  3647864012  2012-10-31T00:00:00.001Z                           \n",
       "1   244999119  3476440521  2012-11-03T00:00:00.001Z                           \n",
       "2  3928440935   517514445  2012-11-05T00:00:00.001Z                           \n",
       "3  2582345152   781585781  2012-10-30T00:00:00.001Z                           \n",
       "4  1051165850  1016098580  2012-09-27T00:00:00.001Z                           \n",
       "\n",
       "  9   ... 100 101 102 103 104 105 106 107 108 109  \n",
       "0   2 ...   0   1   0   0   0   0   0   0   0   9  \n",
       "1   2 ...   0   0   0   0   0   0   0   0   0   7  \n",
       "2   0 ...   0   0   0   0   0   0   0   0   0  12  \n",
       "3   1 ...   0   0   0   0   0   0   0   0   0   8  \n",
       "4   1 ...   0   0   0   0   0   0   0   0   0   9  \n",
       "\n",
       "[5 rows x 110 columns]"
      ]
     },
     "execution_count": 246,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(data)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 247,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>9</th>\n",
       "      <th>10</th>\n",
       "      <th>11</th>\n",
       "      <th>12</th>\n",
       "      <th>13</th>\n",
       "      <th>14</th>\n",
       "      <th>15</th>\n",
       "      <th>16</th>\n",
       "      <th>17</th>\n",
       "      <th>...</th>\n",
       "      <th>100</th>\n",
       "      <th>101</th>\n",
       "      <th>102</th>\n",
       "      <th>103</th>\n",
       "      <th>104</th>\n",
       "      <th>105</th>\n",
       "      <th>106</th>\n",
       "      <th>107</th>\n",
       "      <th>108</th>\n",
       "      <th>109</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>684921758</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>244999119</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3928440935</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2582345152</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1051165850</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 102 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          0   9   10  11  12  13  14  15  16  17  ... 100 101 102 103 104 105  \\\n",
       "0   684921758   2   0   2   0   0   0   0   0   0 ...   0   1   0   0   0   0   \n",
       "1   244999119   2   0   2   0   0   0   0   0   0 ...   0   0   0   0   0   0   \n",
       "2  3928440935   0   0   0   0   0   0   0   0   0 ...   0   0   0   0   0   0   \n",
       "3  2582345152   1   0   2   1   0   0   0   0   0 ...   0   0   0   0   0   0   \n",
       "4  1051165850   1   1   0   0   0   0   0   2   0 ...   0   0   0   0   0   0   \n",
       "\n",
       "  106 107 108 109  \n",
       "0   0   0   0   9  \n",
       "1   0   0   0   7  \n",
       "2   0   0   0  12  \n",
       "3   0   0   0   8  \n",
       "4   0   0   0   9  \n",
       "\n",
       "[5 rows x 102 columns]"
      ]
     },
     "execution_count": 247,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.drop(range(1,9), axis = 1, inplace = True) #每行仅保留event_id和count_N\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 248,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "13418"
      ]
     },
     "execution_count": 248,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 249,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>event</th>\n",
       "      <th>count_1</th>\n",
       "      <th>count_2</th>\n",
       "      <th>count_3</th>\n",
       "      <th>count_4</th>\n",
       "      <th>count_5</th>\n",
       "      <th>count_6</th>\n",
       "      <th>count_7</th>\n",
       "      <th>count_8</th>\n",
       "      <th>count_9</th>\n",
       "      <th>...</th>\n",
       "      <th>count_92</th>\n",
       "      <th>count_93</th>\n",
       "      <th>count_94</th>\n",
       "      <th>count_95</th>\n",
       "      <th>count_96</th>\n",
       "      <th>count_97</th>\n",
       "      <th>count_98</th>\n",
       "      <th>count_99</th>\n",
       "      <th>count_100</th>\n",
       "      <th>count_101</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>684921758</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>244999119</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3928440935</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2582345152</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1051165850</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 102 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        event count_1 count_2 count_3 count_4 count_5 count_6 count_7 count_8  \\\n",
       "0   684921758       2       0       2       0       0       0       0       0   \n",
       "1   244999119       2       0       2       0       0       0       0       0   \n",
       "2  3928440935       0       0       0       0       0       0       0       0   \n",
       "3  2582345152       1       0       2       1       0       0       0       0   \n",
       "4  1051165850       1       1       0       0       0       0       0       2   \n",
       "\n",
       "  count_9    ...    count_92 count_93 count_94 count_95 count_96 count_97  \\\n",
       "0       0    ...           0        1        0        0        0        0   \n",
       "1       0    ...           0        0        0        0        0        0   \n",
       "2       0    ...           0        0        0        0        0        0   \n",
       "3       0    ...           0        0        0        0        0        0   \n",
       "4       0    ...           0        0        0        0        0        0   \n",
       "\n",
       "  count_98 count_99 count_100 count_101  \n",
       "0        0        0         0         9  \n",
       "1        0        0         0         7  \n",
       "2        0        0         0        12  \n",
       "3        0        0         0         8  \n",
       "4        0        0         0         9  \n",
       "\n",
       "[5 rows x 102 columns]"
      ]
     },
     "execution_count": 249,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.columns = df.columns.map(lambda x : (\"event\" if(x == 0) else \"count_\" + str(int(x) - 8))) #重命名列\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 214,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>event</th>\n",
       "      <th>count_1</th>\n",
       "      <th>count_2</th>\n",
       "      <th>count_3</th>\n",
       "      <th>count_4</th>\n",
       "      <th>count_5</th>\n",
       "      <th>count_6</th>\n",
       "      <th>count_7</th>\n",
       "      <th>count_8</th>\n",
       "      <th>count_9</th>\n",
       "      <th>...</th>\n",
       "      <th>count_97</th>\n",
       "      <th>count_98</th>\n",
       "      <th>count_99</th>\n",
       "      <th>count_100</th>\n",
       "      <th>count_101</th>\n",
       "      <th>user</th>\n",
       "      <th>invited</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>interested</th>\n",
       "      <th>not_interested</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>480579258</td>\n",
       "      <td>9</td>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>152</td>\n",
       "      <td>1654935837</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-04-27 21:41:02.227000+00:00</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3551837862</td>\n",
       "      <td>15</td>\n",
       "      <td>14</td>\n",
       "      <td>6</td>\n",
       "      <td>4</td>\n",
       "      <td>7</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>237</td>\n",
       "      <td>1654935837</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-04-27 21:41:02.227000+00:00</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>713488059</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>20</td>\n",
       "      <td>1654935837</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-04-27 21:41:02.227000+00:00</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1197276466</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>52</td>\n",
       "      <td>1654935837</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-04-27 21:41:02.227000+00:00</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>424436456</td>\n",
       "      <td>11</td>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>106</td>\n",
       "      <td>2940917694</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-05-24 04:02:29.999000+00:00</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 107 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        event count_1 count_2 count_3 count_4 count_5 count_6 count_7 count_8  \\\n",
       "0   480579258       9       4       3       1       1       1       7       0   \n",
       "1  3551837862      15      14       6       4       7       2       2       3   \n",
       "2   713488059       1       4       1       0       0       1       0       0   \n",
       "3  1197276466       2       0       1       0       0       0       0       0   \n",
       "4   424436456      11       1       6       2       6       2       2       0   \n",
       "\n",
       "  count_9      ...       count_97 count_98 count_99 count_100 count_101  \\\n",
       "0       1      ...              0        0        0         0       152   \n",
       "1       7      ...              1        0        0         0       237   \n",
       "2       0      ...              0        0        0         0        20   \n",
       "3       0      ...              0        0        0         0        52   \n",
       "4       1      ...              0        0        0         0       106   \n",
       "\n",
       "         user invited                         timestamp interested  \\\n",
       "0  1654935837       0  2012-04-27 21:41:02.227000+00:00          1   \n",
       "1  1654935837       0  2012-04-27 21:41:02.227000+00:00          0   \n",
       "2  1654935837       0  2012-04-27 21:41:02.227000+00:00          0   \n",
       "3  1654935837       0  2012-04-27 21:41:02.227000+00:00          0   \n",
       "4  2940917694       0  2012-05-24 04:02:29.999000+00:00          0   \n",
       "\n",
       "  not_interested  \n",
       "0              0  \n",
       "1              0  \n",
       "2              0  \n",
       "3              0  \n",
       "4              0  \n",
       "\n",
       "[5 rows x 107 columns]"
      ]
     },
     "execution_count": 214,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#与训练数据相join，找到相应的标签，即interested和not_interested。后续还要用类似方法找出test集中出现的活动的标签\n",
    "train = pd.merge(df, pd.read_csv(\"train.csv\", converters={'event':str}), on = \"event\", how = 'inner')\n",
    "train = train.sort_values(by = \"timestamp\").reset_index(drop=True)  #按时间顺序排序\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 215,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(15398, 107)\n"
     ]
    }
   ],
   "source": [
    "print(train.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 230,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(15398, 1)\n"
     ]
    }
   ],
   "source": [
    "X_train = train.drop([\"event\",\"user\",\"invited\",\"timestamp\",\"interested\",\"not_interested\"], axis = 1)\n",
    "y_train_in = train.interested\n",
    "y_train_not_in = train.not_interested\n",
    "#对数据进行PCA降维\n",
    "from sklearn.decomposition import PCA\n",
    "pca = PCA(n_components=0.75)\n",
    "pca.fit(X_train)\n",
    "\n",
    "X_train_pca = pca.transform(X_train)\n",
    "\n",
    "# 降维后的特征维数\n",
    "print(X_train_pca.shape) #居然降成了一维？？？"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 218,
   "metadata": {},
   "outputs": [],
   "source": [
    "#将timestamp最晚的20%作为校验集\n",
    "X_val = pd.DataFrame(X_train_pca).tail(int(X_train_pca.shape[0] * 0.2))\n",
    "X_train_part = pd.DataFrame(X_train_pca).head(int(X_train_pca.shape[0] * 0.8) + 1) #int会舍弃小数，补一条\n",
    "\n",
    "#感兴趣的标签\n",
    "y_val_in = y_train_in.tail(int(X_train_pca.shape[0] * 0.2))\n",
    "y_train_part_in = y_train_in.head(int(X_train_pca.shape[0] * 0.8) + 1)\n",
    "\n",
    "#不感兴趣的标签\n",
    "y_val_not_in = y_train_not_in.tail(int(X_train_pca.shape[0] * 0.2))\n",
    "y_train_part_not_in = y_train_not_in.head(int(X_train_pca.shape[0] * 0.8) + 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 219,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(12319, 1)\n",
      "(3079, 1)\n",
      "(12319,)\n",
      "(3079,)\n",
      "(12319,)\n",
      "(3079,)\n"
     ]
    }
   ],
   "source": [
    "print(X_train_part.shape)\n",
    "print(X_val.shape)\n",
    "\n",
    "print(y_train_part_in.shape)\n",
    "print(y_val_in.shape)\n",
    "\n",
    "print(y_train_part_not_in.shape)\n",
    "print(y_val_not_in.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "开始训练"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 220,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 一个参数点（聚类数据为K）的模型，在校验集上评价聚类算法性能\n",
    "from sklearn.cluster import MiniBatchKMeans\n",
    "import time\n",
    "from sklearn import metrics\n",
    "def K_cluster_analysis(K, X_train, y_train, X_val, y_val):\n",
    "    start = time.time()\n",
    "    \n",
    "    print(\"K-means begin with clusters: {}\".format(K));\n",
    "    \n",
    "    #K-means,在训练集上训练\n",
    "    mb_kmeans = MiniBatchKMeans(n_clusters = K)\n",
    "    mb_kmeans.fit(X_train)\n",
    "    \n",
    "    # 在训练集和测试集上测试\n",
    "    #y_train_pred = mb_kmeans.fit_predict(X_train)\n",
    "    y_val_pred = mb_kmeans.predict(X_val)\n",
    "    \n",
    "    #以前两维特征打印训练数据的分类结果\n",
    "    #plt.scatter(X_train[:, 0], X_train[:, 1], c=y_pred)\n",
    "    #plt.show()\n",
    "\n",
    "    # K值的评估标准\n",
    "    #常见的方法有轮廓系数Silhouette Coefficient和Calinski-Harabasz Index\n",
    "    #这两个分数值越大则聚类效果越好\n",
    "    #CH_score = metrics.calinski_harabaz_score(X_train,mb_kmeans.predict(X_train))\n",
    "    CH_score = metrics.silhouette_score(X_train,mb_kmeans.predict(X_train))\n",
    "    \n",
    "    #也可以在校验集上评估K\n",
    "    v_score = metrics.v_measure_score(y_val, y_val_pred)\n",
    "    \n",
    "    end = time.time()\n",
    "    print(\"CH_score: {}, time elaps:{}\".format(CH_score, int(end-start)))\n",
    "    print(\"v_score: {}\".format(v_score))\n",
    "    \n",
    "    return CH_score,v_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 221,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Search grid for the hyper-parameter K (number of clusters): 10, 20, ..., 100.\n",
    "Ks = list(range(10, 101, 10))\n",
    "# Per-K result lists, appended to in the same order as Ks.\n",
    "CH_scores, v_scores = [], []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 222,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with clusters: 10\n",
      "CH_score: 0.5079947740275408, time elaps:12\n",
      "v_score: 0.0018719650644760798\n",
      "K-means begin with clusters: 20\n",
      "CH_score: 0.5684964887338423, time elaps:4\n",
      "v_score: 0.0033426737516110273\n",
      "K-means begin with clusters: 30\n",
      "CH_score: 0.5508672334529781, time elaps:4\n",
      "v_score: 0.004016649839773557\n",
      "K-means begin with clusters: 40\n",
      "CH_score: 0.5745220231087055, time elaps:4\n",
      "v_score: 0.0035809264424698506\n",
      "K-means begin with clusters: 50\n",
      "CH_score: 0.5746224908332466, time elaps:4\n",
      "v_score: 0.004736244884959922\n",
      "K-means begin with clusters: 60\n",
      "CH_score: 0.5809804253987266, time elaps:4\n",
      "v_score: 0.0055824654454527615\n",
      "K-means begin with clusters: 70\n",
      "CH_score: 0.59691321399414, time elaps:4\n",
      "v_score: 0.005836481767437676\n",
      "K-means begin with clusters: 80\n",
      "CH_score: 0.62824482671254, time elaps:4\n",
      "v_score: 0.006536492444940883\n",
      "K-means begin with clusters: 90\n",
      "CH_score: 0.6286617347254121, time elaps:4\n",
      "v_score: 0.007683677706967162\n",
      "K-means begin with clusters: 100\n",
      "CH_score: 0.650564543128061, time elaps:4\n",
      "v_score: 0.007487923101196764\n"
     ]
    }
   ],
   "source": [
    "# Evaluate every K against the \"interested\" labels.\n",
    "# Start from empty lists so that re-running this cell does not append duplicates,\n",
    "# which would make the lists longer than Ks and break plotting scores against Ks.\n",
    "CH_scores = []\n",
    "v_scores = []\n",
    "for K in Ks:\n",
    "    ch, v = K_cluster_analysis(K, X_train_part, y_train_part_in, X_val, y_val_in)\n",
    "    CH_scores.append(ch)\n",
    "    v_scores.append(v)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 223,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<matplotlib.lines.Line2D at 0x17593a908>]"
      ]
     },
     "execution_count": 223,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD8CAYAAACb4nSYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzt3Xl8VOXd9/HPT5YqWqwVUGQRtKFVKW4potYNymJVrCKKG6BC3Ci2Vnpj9b71xuVxaStVqSYgiriAO+CGiNatYgmiyPKgiCwRWlAWd5bk9/xxTR6GGMiEzMyZzPm+X695kXPmSuY3w+Q7J9e5znWZuyMiIvGwU9QFiIhI9ij0RURiRKEvIhIjCn0RkRhR6IuIxIhCX0QkRhT6IiIxotAXEYkRhb6ISIw0jLqAqpo1a+bt2rWLugwRkXpl1qxZn7l785ra5Vzot2vXjtLS0qjLEBGpV8xsaSrt1L0jIhIjCn0RkRhR6IuIxIhCX0QkRhT6IiIxotAXEYkRhb6ISIwo9EVEcsDkyTBuXOYfR6EvIhKx0aPhtNOgpATKyzP7WAp9EZGIuMOIEVBUBD17wksvQYMGmX3MnJuGQUQkDsrL4fLLobgYBg4MR/mNGmX+cXWkLyKSZd9+C336hMD/059g7NjsBD7oSF9EJKvWrIFTToG334a77oIhQ7L7+Ap9EZEsWbYMevWCjz+Gxx6DM87Ifg0KfRGRLJg7NwT+l1/C1Klw/PHR1KE+fRGRDHv9dfjlL8NonTfeiC7wQaEvIpJRTz4JPXpAy5bwz39Cp07R1qPQFxHJkL//Hfr2hcMOgzffhH33jboihb6ISNq5w7XXhnH4p5wCL78Me+4ZdVVBSqFvZr3MbKGZLTKz4dtoc6aZzTezeWb2SJX7mprZp2Z2dzqKFhHJVZs3w6BBcNNNMHhw6N5p0iTqqraocfSOmTUARgHdgTJgpplNdvf5SW0KgKuBo919rZm1qPJjbgBeS1/ZIiK55+uv4ayz4Lnn4Lrrws0s6qq2lsqRfmdgkbsvdveNwATg1CptBgOj3H0tgLuvqrzDzA4H9gJeSk/JIiK557PPoFs3eOEFuPdeuP763At8SC30WwHLk7bLEvuSdQA6mNlbZjbDzHoBmNlOwF+AYekoVkQkFy1ZAkcfDe+/H7pzLr446oq2LZWLs6r7rPJqfk4BcDzQGnjDzDoC5wHPu/ty285HnpkVAUUAbdu2TaEkEZHc8P774aKr776DadPCePxclkrolwFtkrZbAyuqaTPD3TcBn5jZQsKHwJHAMWZ2GbAb0NjMvnL3rU4Gu3sJUAJQWFhY9QNFRCQnvfoq/OY30LRpGJJ50EFRV1SzVLp3ZgIFZtbezBoD/YDJVdo8A5wAYGbNCN09i939XHdv6+7tgKuAB6sGvohIfTRxYjjCb9MmTJ5WHwIfUgh9d98MDAGmAguAx9x9npmNMLPeiWZTgc/NbD7wKjDM3T/PVNEiIlH629+gXz844ogwrULr1lFXlDpzz63elMLCQi8tLY26DBGR76mogKuvhttug9NPh4cfhp13jrqqwMxmuXthTe00y6aISAo2bYKLLoLx4+Gyy+DOOzO/tGEmKPRFRGrw1Vdh7vupU+HGG8NqV7k4Bj8VCn0Rke1YtQpOOglmz4b77oMLL4y6orpR6IuIbMPHH0PPnrBiBUyaFMK/vlPoi4hUY9Ys+PWvobwcXnkFunSJuqL00NTKIiJVTJsWVrfaZRd46638CXxQ6IuIbOXhh8MR/v77h5WufvrTqCtKL4W+iEjCX/4C550HxxwDr70G++wTdUXpp9AXkdirqIArr4SrroIzzwzTI+++e9RVZYZO5IpI3quoCBdXbd4cblW/vuYaePRRuOIK+OtfYac8PhxW6ItIJP797zCHzRdffD+MtxfQO3J/KrPN3HZbONKvrxddpUqhLyJZt3EjnHYazJwJe+wB
DRuGW6NG2/+6SZPU29Z0f/LXBQVhEZQ4UOiLSNZdeSXMmAGPPx6mN5DsyeOeKxHJRQ8/DKNGwR/+oMCPgkJfRLLmgw9g8GA47ji45Zaoq4knhb6IZMX69WEO+h/9CCZMCH3pkn162UUk4yoqYMAAWLIE/vEP2HvvqCuKL4W+iGTcbbeFWSpHjozPKJlcpe4dEcmo6dPDxU/9+sHQoVFXIymFvpn1MrOFZrbIzIZvo82ZZjbfzOaZ2SOJfYeY2duJfXPM7Kx0Fi8iuW358hD2P/sZjB6d/xc+1Qc1du+YWQNgFNAdKANmmtlkd5+f1KYAuBo42t3XmlmLxF3fAP3d/SMz2weYZWZT3X1d2p+JiOSUDRugb9/w71NPwW67RV2RQGp9+p2BRe6+GMDMJgCnAvOT2gwGRrn7WgB3X5X498PKBu6+wsxWAc0Bhb5InrvySnjnHXjyyfybnrg+S6V7pxWwPGm7LLEvWQegg5m9ZWYzzKxX1R9iZp2BxsDHO1qsiNQP48fD3/8Ow4aFYZqSO1I50q+uF67q9EUNgQLgeKA18IaZdazsxjGzlsB4YIC7V3zvAcyKgCKAtm3bply8iOSeOXPg4ovDylM33xx1NVJVKkf6ZUCbpO3WwIpq2kxy903u/gmwkPAhgJk1BZ4DrnX3GdU9gLuXuHuhuxc2b968ts9BRHLEunXhyH6PPXQBVq5KJfRnAgVm1t7MGgP9gMlV2jwDnABgZs0I3T2LE+2fBh5098fTV7aI5JqKCujfH5YuDROp7bVX1BVJdWoMfXffDAwBpgILgMfcfZ6ZjTCz3olmU4HPzWw+8CowzN0/B84EjgUGmtl7idshGXkmIhKpW26BKVPCIiRHHRV1NbIt5qmsLpBFhYWFXlpaGnUZIlILL78MPXvCWWeFWTQ1Hj/7zGyWuxfW1E5X5IpInSxbBmefDQceqAuw6gOFvojssA0bwpz4GzeG8fi77hp1RVITnVsXkR32u9+FJQ+fego6dIi6GkmFjvRFZIeMGwf33gt//GNY71bqB4W+iNTae+/BJZfACSfATTdFXY3UhkJfRGpl7Vro0wf23FMXYNVH+u8SkZRVXoC1fDm89hq0aFHz90huUeiLSMpuvhmefRbuuguOPDLqamRHqHtHRFLy0kvwP/8D554Ll18edTWyoxT6IlKjpUvhnHPgoIOguFgXYNVnCn0R2a7vvgsXYG3aFMbj6wKs+k19+iKyXVdcAaWl8MwzUFAQdTVSVzrSF5FteuABKCmB4cPh1FOjrkbSQaEvItWaPRsuvRS6doUbboi6GkkXhb6IfE/yBViPPqoLsPKJ/itFZCsVFXDeeVBWBq+/rguw8o1CX0S2ctNN8PzzMGoUdOkSdTWSbureEZH/78UX4brrwpH+pZdGXY1kgkJfRABYsiRcbfvzn+sCrHyWUuibWS8zW2hmi8xs+DbanGlm881snpk9krR/gJl9lLgNSFfhIpI+lRdglZeHFbCaNIm6IsmUGvv0zawBMAroDpQBM81ssrvPT2pTAFwNHO3ua82sRWL/j4HrgELAgVmJ712b/qciIjvqt7+FWbNg0iT4yU+irkYyKZUj/c7AIndf7O4bgQlA1cs0BgOjKsPc3Vcl9vcEprn7msR904Be6SldRNJh7FgYMwauvhp69466Gsm0VEK/FbA8absssS9ZB6CDmb1lZjPMrFctvhczKzKzUjMrXb16derVi0idvPsuXHYZ/OpXugArLlIJ/epO53iV7YZAAXA8cDYwxsx+lOL34u4l7l7o7oXNmzdPoSQRqas1a8IFWC1awCOPQIMGUVck2ZBK6JcBbZK2WwMrqmkzyd03ufsnwELCh0Aq3ysiWVZ5Adann8Ljj4OOteIjldCfCRSYWXszawz0AyZXafMMcAKAmTUjdPcsBqYCPcxsDzPbA+iR2CciEbrhBnjhBbjzTjjiiKirkWyqcfSOu282syGEsG4AjHX3eWY2Aih198lsCff5QDkw
zN0/BzCzGwgfHAAj3H1NJp6IiNTsq6/gmmtC2PfvDxdfHHVFkm3m/r0u9kgVFhZ6aWlp1GWI5J2XX4bBg8NFWJdfDrffDrvsEnVVki5mNsvdC2tqpytyRfLcunUwaBB07w6NGoVJ1O6+W4EfVwp9kTw2ZUpY1/b+++GPf4T334djjom6KomSQl8kD332WZhHp3fvMCf+O+/Arbfq6F4U+iJ5xR0mToQDDwxDMa+/PqxvW1hjT6/EhebTF8kTK1aEq2snTYJf/CJMr9CxY9RVSa7Rkb5IPeceAv7AA2Hq1DAq55//VOBL9XSkL1KPLVkCRUUwbVo4QXvffVBQEHVVkst0pC9SD1VUhGGXHTvC22+HpQ3/8Q8FvtRMR/oi9cyHH8JFF8Gbb0KPHlBSAvvuG3VVUl/oSF+knti8GW67DQ4+GObODWPvX3xRgS+1oyN9kXrggw/gwgvD8MvTTgvdOS1bRl2V1Ec60hfJYRs3hrH2hx8OS5fCY4+FNWwV+LKjdKQvkqNmzgxH93PnhqtrR46EZs2irkrqOx3pi+SYb7+FYcOgSxdYuxaefRYeekiBL+mhI32RHPL662FkzqJFYRrk22+H3XePuirJJzrSF8kBX34Z5rg/7jgoL4fp08NQTAW+pJtCXyRiL70ULrK65x644oowUqdr16irknyl0BeJyNq1cMEF0LMnNGkSLrYaORJ23TXqyiSfpRT6ZtbLzBaa2SIzG17N/QPNbLWZvZe4DUq67zYzm2dmC8zsTjOzdD4Bkfro6afDBGnjx8Of/gSzZ8NRR0VdlcRBjSdyzawBMAroDpQBM81ssrvPr9J0orsPqfK9RwFHA50Su94EjgP+Uce6ReqlVavgt78N4+0POQSefx4OPTTqqiROUhm90xlY5O6LAcxsAnAqUDX0q+PAzkBjwIBGwH92rFSR+qe8HN59N5yYfeWV0IVTXg433hiWL2zUKOoKJW5SCf1WwPKk7TLgiGra9TGzY4EPgd+7+3J3f9vMXgVWEkL/bndfUNeiRXKVOyxYEEJ++vQw8+X69eG+jh3DMMxLLoEDDoi0TImxVEK/uj54r7I9BXjU3TeY2SXAOKCrmf0EOABonWg3zcyOdffXt3oAsyKgCKBt27a1qV8kckuXbgn5V16Bf/877G/fHs44A7p1C6Nx9tor2jpFILXQLwPaJG23BlYkN3D3z5M2RwO3Jr4+DZjh7l8BmNkLQBfg9SrfXwKUABQWFlb9QBHJKatXh3CvDPmPPw7799orhHvXriHo27ePtk6R6qQS+jOBAjNrD3wK9APOSW5gZi3dfWViszdQ2YWzDBhsZv+H8BfDccDIdBQuki1ffBGulK0M+Tlzwv6mTeH442Ho0BD0Bx0EGpsmua7G0Hf3zWY2BJgKNADGuvs8MxsBlLr7ZGComfUGNgNrgIGJb38C6Ap8QOgSetHdp6T/aYikz3ffhdWoKkP+X/8KJ1933hmOPhpuvjmE/OGHQ0NNZCL1jLnnVm9KYWGhl5aWRl2GxEh5OcyataVf/q23QvA3aAC/+EXoqunWDY48MgS/SC4ys1nuXlhTOx2nSOy4w/z5W0L+tde2jLD5+c/D6Jpu3eDYY0MXjkg+UehLRrnDmjVhIW/3cEv+OpO3qo8zd+6WLpv/JK4W2X9/OPPMEPInnAAtWkT7eolkmkJfMubrr+H008OEYrli7723dNd066b1ZSV+FPqSEV9+CSedFPrHr7kmhK3Z9m877VRzmx25Vf7cdu3CRVEaYSNxptCXtFu3Dk48MSz398gjcNZZUVckIpUU+pJWn38OPXqEOeEffxxOOy3qikQkmUJf0mbVKujeHRYuDFMHn3RS1BWJSFUKfUmLlSvDidElS2DKlBD+IpJ7FPpSZ8uXhytUV66EF14I67yKSG5S6EudfPJJCPw1a8LQTK3+JJLbtEZumq1aBRddFJa/y3cffRSO6tevDxc9KfBFcp9CP83uvBPGjoUuXeCOO8JVoflo
wYIQ+N9+G65wLaxxxg8RyQUK/TTatAnuuy9czn/iiXDllWEEy3/ybIHIOXNC4FdUhJWhDjkk6opEJFUK/TSaPDmsmvT734chi6NGhVDs1AmmTo26uvR4993woda4cZhj/qCDoq5IRGpDoZ9GxcXQpg38+tfhUv/LLgtXpTZvDr16wR/+ABs2RF3ljnvnnXDS9oc/DIHfoUPUFYlIbSn00+Tjj2HaNBg0KMzDXqljxxD8l18Of/1rmJN94cLo6txRb7wBv/oVNGsWpiLeb7+oKxKRHaHQT5PRo0PYX3TR9+/bZRe4+26YNAmWLYPDDgsne3Ns/ZpteuWV8JdKq1Yh8DUzpUj9pdBPg40bQ4iffHIIxm3p3Rvefx+OOCJ8OPTrFyYny2UvvhhORu+3Xwj87T0/Ecl9Cv00ePppWL0aLr645ratWoVuoJtvhiefDCNf3nor8zXuiMmT4dRT4Wc/g1dfhb32iroiEamrlELfzHqZ2UIzW2Rmw6u5f6CZrTaz9xK3QUn3tTWzl8xsgZnNN7N26Ss/NxQXh7nae/RIrX2DBnD11SHsd9opLMs3YkRYqzVXPPEE9OkDBx8cuneaNYu6IhFJhxpD38waAKOAE4EDgbPN7MBqmk5090MStzFJ+x8Ebnf3A4DOwKo01J0zPvwwHAUPHrz1CdxUHHEEvPcenH02XHddGAq5bFlm6qyNhx8Oc+AfcQS8/DLssUfUFYlIuqRypN8ZWOTui919IzABODWVH574cGjo7tMA3P0rd/9mh6vNQSUl0LAhXHjhjn1/06bw0EPw4INh6oaDDw7dPlEZOxbOPz/89fHii1oYXCTfpBL6rYDlSdtliX1V9TGzOWb2hJm1SezrAKwzs6fMbLaZ3Z74yyEvfPcd3H9/6Pfee++6/azzzw+hX1AAZ5wBRUVhjdlsuvfecIK5e3d47jnYbbfsPr6IZF4qoV/diqJVBxtOAdq5eyfgZWBcYn9D4BjgKuAXwH7AwO89gFmRmZWaWenq1atTLD16Tz4ZZpdM5QRuKn7yE3jzTfiv/4IxY8J8Nu+/n56fXZORI+HSS8MIpEmToEmT7DyuiGRXKqFfBrRJ2m4NrEhu4O6fu3vltaajgcOTvnd2omtoM/AMcFjVB3D3EncvdPfC5s2b1/Y5RKa4GPbfPyweki6NG8Mtt4QRPuvXQ+fO8Le/ZXZM/623hqkjTj89fJDtvHPmHktEopVK6M8ECsysvZk1BvoBk5MbmFnLpM3ewIKk793DzCqTvCswv24l54b588NVqkVFYQROunXrFo7ye/SA3/0uHIGvSvMpcHf43/+F4cPDyeSJE8OHjojkrxrjKnGEPgSYSgjzx9x9npmNMLPeiWZDzWyemb0PDCXRhePu5YSunelm9gGhq2h0+p9G9pWUQKNGMHBg5h6jefMwVv6uu8J89QcfHP4CSAd3uOYauP768BzGjw8npEUkv5nn2FwAhYWFXlpaGnUZ2/Xtt7DPPtCzJ0yYkJ3HnDMnXMG7YAFcdRXcdNOOH5W7h8nf7rgj/KVyzz2Z+WtFRLLHzGa5e40rW+hXfQc89liYPuGSS7L3mJ06QWlpOGn85z+HVao++qj2P6eiIkz+dscdMHRoGLGjwBeJD/2674DiYvjpT7O/AHiTJiGkn3wSFi+GQw+FBx5I/SRvefmWI/thw8KIHatubJaI5C2Ffi198AG8/XYIz6gC8/TTQ3dPYSFccAGcc04Y6bM9mzeHvvv77oP//u8wYkeBLxI/Cv1aKi6GH/wABgyIto7WrcPJ3RtugMcfDxO3zZhRfdtNm8IHw0MPwY03hnl+FPgi8aTQr4Wvvw6jXM44A/bcM+pqwlw/114bho4C/PKX4QRv8sRtGzZA377hg+H228OIHRGJL4V+LUyYAF98kd0TuKk48sgwcVvfvuFDoFs3KCsLo4xOOy1cYXvXXWHUj4jEm4Zs1kLnzuFof+7c3OwecYdx42DIkDCcs0MH+Ne/wsnfoqKo
qxORTNKQzTSbPTusdXvxxbkZ+BDqGjgQ3n0X2rcP9d5/vwJfRLbQNZgpKi4Oc9Kcf37UldSsQ4dwUnflSmjbNupqRCSX6Eg/BV9+uWVhkfqyoEijRgp8Efk+hX4KHnkEvvoq907giojUlkK/Bu6ha6dTp7B8oIhIfabQr0FpaTiJm8sncEVEUqXQr0FxcZjz5txzo65ERKTuFPrbsX49PPpomMJg992jrkZEpO4U+tvx0EPwzTfpWwNXRCRqCv1tqDyBe9hhYTZLEZF8oNDfhhkzwjTKOsoXkXyi0N+G4mLYbbewYLiISL5IKfTNrJeZLTSzRWY2vJr7B5rZajN7L3EbVOX+pmb2qZndna7CM2ntWpg4Ec47D374w6irERFJnxrn3jGzBsAooDtQBsw0s8nuPr9K04nuPmQbP+YG4LU6VZpFDz4I332nrh0RyT+pHOl3Bha5+2J33whMAE5N9QHM7HBgL+ClHSsxuypP4HbuHFajEhHJJ6mEfitgedJ2WWJfVX3MbI6ZPWFmbQDMbCfgL8CwOleaJW++CQsW6ChfRPJTKqFf3eQDVVdemQK0c/dOwMvAuMT+y4Dn3X0522FmRWZWamalq1evTqGkzCkuhqZNw4yaIiL5JpXQLwPaJG23BlYkN3D3z919Q2JzNHB44usjgSFmtgT4M9DfzG6p+gDuXuLuhe5e2Lx581o+hfT57LOwlmz//rDrrpGVISKSMaksojITKDCz9sCnQD/gnOQGZtbS3VcmNnsDCwDc/dykNgOBQnf/3uifXDFuHGzcqK4dEclfNYa+u282syHAVKABMNbd55nZCKDU3ScDQ82sN7AZWAMMzGDNGeEOJSVw1FHQsWPU1YiIZIYWRk949VXo2jUc7ffvn/WHFxGpEy2MXkvFxWEpxL59o65ERCRzFPrAqlXw1FMwYADsskvU1YiIZI5CH7j/fti0CYqKoq5ERCSzYh/6FRXhBO6xx8IBB0RdjYhIZsU+9KdPh8WLNUxTROIh9qFfXAzNmkGfPlFXIiKSebEO/ZUr4ZlnYOBA+MEPoq5GRCTzYh36Y8dCeblO4IpIfMQ29MvLYfTocEFWQUHU1YiIZEdsQ/+ll2DpUp3AFZF4iW3oFxdDixbwm99EXYmISPbEMvTLymDKFLjwQmjcOOpqRESyJ5ahf9994aKswYOjrkREJLtiF/qbN8OYMdCjB+y3X9TViIhkV+xC/4UXQveOTuCKSBzFLvSLi6FlSzjllKgrERHJvliF/tKl8PzzcNFF0KhR1NWIiGRfrEJ/zJjw76BB0dYhIhKV2IT+pk1h1M6JJ8K++0ZdjYhINFIKfTPrZWYLzWyRmQ2v5v6BZrbazN5L3AYl9h9iZm+b2Twzm2NmZ6X7CaTq2WfDBGuXXBJVBSIi0WtYUwMzawCMAroDZcBMM5vs7vOrNJ3o7kOq7PsG6O/uH5nZPsAsM5vq7uvSUXxtFBdD69bhSF9EJK5SOdLvDCxy98XuvhGYAJyayg939w/d/aPE1yuAVUDzHS12Ry1eDFOnhr78hjV+zImI5K9UQr8VsDxpuyyxr6o+iS6cJ8ysTdU7zawz0Bj4uJr7isys1MxKV69enWLpqRs9GnbaKYzaERGJs1RC36rZ51W2pwDt3L0T8DIwbqsfYNYSGA9c4O4V3/th7iXuXujuhc2bp/cPgY0bw7z5J58cundEROIsldAvA5KP3FsDK5IbuPvn7r4hsTkaOLzyPjNrCjwHXOvuM+pWbu1NmgSrVukErogIpBb6M4ECM2tvZo2BfsDk5AaJI/lKvYEFif2NgaeBB9398fSUXDvFxWGIZo8eUTy6iEhuqfG0prtvNrMhwFSgATDW3eeZ2Qig1N0nA0PNrDewGVgDDEx8+5nAscCeZla5b6C7v5fep1G9jz6C6dPhxhuhQYNsPKKISG4z96rd89EqLCz00tLStPysYcNg5EhYtizMtyMikq/MbJa7F9bULm+vyN2wAR54AHr3VuCL
iFTK29B/6in47DOdwBURSZa3oV9cHBZJ6dYt6kpERHJHXob+ggXw2mtQVBQuyhIRkSAvI7GkJMyXf8EFUVciIpJb8i70v/0Wxo2D00+HFi2irkZEJLfkXeg/8QSsXas1cEVEqpN3oV9cDB06wPHHR12JiEjuyavQnzsX3nornMC16qaJExGJubwK/eJiaNwYBgyIuhIRkdyUN6H/zTcwfjz07QvNmkVdjYhIbsqb0F+3Dnr1gksvjboSEZHclTeLB+6zD0yYEHUVIiK5LW+O9EVEpGYKfRGRGFHoi4jEiEJfRCRGFPoiIjGi0BcRiRGFvohIjCj0RURixNw96hq2YmargaVR11FHzYDPoi4ih+j12Jpejy30WmytLq/Hvu7evKZGORf6+cDMSt29MOo6coVej63p9dhCr8XWsvF6qHtHRCRGFPoiIjGi0M+MkqgLyDF6Pbam12MLvRZby/jroT59EZEY0ZG+iEiMKPTryMzamNmrZrbAzOaZ2RWJ/T82s2lm9lHi3z2irjVbzKyBmc02s2cT2+3N7J3EazHRzBpHXWO2mNmPzOwJM/u/iffIkTF/b/w+8Xsy18weNbOd4/T+MLOxZrbKzOYm7av2/WDBnWa2yMzmmNlh6ahBoV93m4E/uPsBQBfgcjM7EBgOTHf3AmB6YjsurgAWJG3fCtyReC3WAhdFUlU0/ga86O4/Aw4mvC6xfG+YWStgKFDo7h2BBkA/4vX+eADoVWXftt4PJwIFiVsRcE9aKnB33dJ4AyYB3YGFQMvEvpbAwqhry9Lzb51443YFngWMcLFJw8T9RwJTo64zS69FU+ATEufOkvbH9b3RClgO/Jiwat+zQM+4vT+AdsDcmt4PQDFwdnXt6nLTkX4amVk74FDgHWAvd18JkPi3RXSVZdVI4I9ARWJ7T2Cdu29ObJcRfvnjYD9gNXB/ortrjJntSkzfG+7+KfBnYBmwElgPzCK+749K23o/VH5IVkrLa6PQTxMz2w14Evidu38RdT1RMLOTgVXuPit5dzVN4zJkrCFwGHCPux8KfE1MunKqk+irPhVoD+wD7ErowqgqLu+PmmTkd0ehnwZm1ogQ+A+7+1OJ3f8xs5aJ+1sCq6KqL4uOBnqb2RJgAqGLZySNHpl/AAABT0lEQVTwIzNrmGjTGlgRTXlZVwaUufs7ie0nCB8CcXxvAPwK+MTdV7v7JuAp4Cji+/6otK33QxnQJqldWl4bhX4dmZkB9wEL3P2vSXdNBgYkvh5A6OvPa+5+tbu3dvd2hBN0r7j7ucCrwBmJZrF4LQDc/d/AcjP7aWJXN2A+MXxvJCwDuphZk8TvTeXrEcv3R5JtvR8mA/0To3i6AOsru4HqQhdn1ZGZ/RJ4A/iALf3YfyL06z8GtCW82fu6+5pIioyAmR0PXOXuJ5vZfoQj/x8Ds4Hz3H1DlPVli5kdAowBGgOLgQsIB1uxfG+Y2f8CZxFGvc0GBhH6qWPx/jCzR4HjCbNp/ge4DniGat4PiQ/Guwmjfb4BLnD30jrXoNAXEYkPde+IiMSIQl9EJEYU+iIiMaLQFxGJEYW+iEiMKPRFRGJEoS8iEiMKfRGRGPl/oJdimBJ9wC0AAAAASUVORK5CYII=\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x6e1c1780>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "%matplotlib inline\n",
    "# Plot the score collected for each K against the number of clusters to find the\n",
    "# best-scoring K. CH_scores holds the first value returned by K_cluster_analysis,\n",
    "# which is the silhouette coefficient.\n",
    "plt.plot(Ks, np.array(CH_scores), 'b-')\n",
    "plt.xlabel('K (number of clusters)')\n",
    "plt.ylabel('silhouette score (CH_score)')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "对感兴趣的标签，K=100时CH_score（此处实际计算的是轮廓系数 silhouette score，而非 Calinski-Harabasz 指数）最大，约为0.65"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 224,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with clusters: 10\n",
      "CH_score: 0.5854988829864187, time elaps:7\n",
      "v_score: 0.0017922850058761733\n",
      "K-means begin with clusters: 20\n",
      "CH_score: 0.5643801663584647, time elaps:4\n",
      "v_score: 0.003437911659160513\n",
      "K-means begin with clusters: 30\n",
      "CH_score: 0.5620732300074337, time elaps:4\n",
      "v_score: 0.003727665547224045\n",
      "K-means begin with clusters: 40\n",
      "CH_score: 0.5618002427410937, time elaps:4\n",
      "v_score: 0.003813267754421741\n",
      "K-means begin with clusters: 50\n",
      "CH_score: 0.5751220080699636, time elaps:4\n",
      "v_score: 0.005153528550002239\n",
      "K-means begin with clusters: 60\n",
      "CH_score: 0.5795959410355889, time elaps:4\n",
      "v_score: 0.006035354879354442\n",
      "K-means begin with clusters: 70\n",
      "CH_score: 0.5943891511685452, time elaps:4\n",
      "v_score: 0.005877372170762056\n",
      "K-means begin with clusters: 80\n",
      "CH_score: 0.595014947596357, time elaps:5\n",
      "v_score: 0.006946843025312192\n",
      "K-means begin with clusters: 90\n",
      "CH_score: 0.638846385357448, time elaps:5\n",
      "v_score: 0.006443839607630537\n",
      "K-means begin with clusters: 100\n",
      "CH_score: 0.6526919684795414, time elaps:5\n",
      "v_score: 0.007647821020299864\n"
     ]
    }
   ],
   "source": [
    "# Evaluate every K against the \"not interested\" labels.\n",
    "# Both result lists are rebound to fresh empty lists first, replacing earlier contents.\n",
    "CH_scores, v_scores = [], []\n",
    "for K in Ks:\n",
    "    ch, v = K_cluster_analysis(\n",
    "        K, X_train_part, y_train_part_not_in, X_val, y_val_not_in\n",
    "    )\n",
    "    CH_scores += [ch]\n",
    "    v_scores += [v]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 225,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<matplotlib.lines.Line2D at 0xcc0a630>]"
      ]
     },
     "execution_count": 225,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD8CAYAAACb4nSYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAHqtJREFUeJzt3Xl8VOW9x/HPDyJwoVUUomVfbKhLXapRVCwiCiIi7gJuxVrRVqxt1aptvd56b1v1tlWr6BWp+7BUBAkqAgIKLijh6kVAEcQFkE2WeiUKxvzuH8/kJoTETMgkZ2bO9/16zWsyZ05mfjOvk+85ec5znsfcHRERiYcmURcgIiKNR6EvIhIjCn0RkRhR6IuIxIhCX0QkRhT6IiIxotAXEYkRhb6ISIwo9EVEYiQv6gKqatu2rXft2jXqMkREssrChQs/dff82tbLuNDv2rUrxcXFUZchIpJVzOyjVNZT846ISIwo9EVEYkShLyISIwp9EZEYUeiLiMSIQl9EJEYU+iIiMaLQFxHJAEVF8OijDf8+Cn0RkYg98ACcdRaMHg1ff92w76XQFxGJiDvcfDNceSUMGAAzZkDTpg37nhk3DIOISBx89RVccQU8/DBcdhn8139BXiMkso70RUQa2eefw+DBIfBvuQUefLBxAh90pC8i0qjWr4fTToM33wxt+Jdf3rjvr9AXEWkky5eHtvu1a2HKFBg0qPFrUOiLiDSCN94IR/gAc+ZAz57R1KE2fRGRBvbss3DiibDnnvDqq9EFPij0RUQa1JgxcMYZcOCBIfALCqKtR6EvItIA3OH3vw8navv1gxdfhP32i7oqtemLiKRdaSn89KfhKH/48NBLZ489oq4q0JG+iEgabdsGZ54ZAv93v4OHHsqcwAcd6YuIpM3GjaEbZnEx3H9/GF4h0yj0RUTS4P33Qx/81ath0qRw8jYTKfRFROqpuDj0wS8thdmz4dhjo66oZmrTFxGph2nToE8faNkydMnM5MAHhb6IyG575BE4/fTQ9/7VV+F734u6otop9EVE6sgd/uM/4NJLoW9feOklaNcu6qpSozZ9EZE6KC2FkSPDbFcXXQR//zs0axZ1VanTkb6ISIpKSuCcc0Lg33gjPPZYdgU+6EhfRCQlmzaF9vv58+Gee8LRfjZS6IuI1OKDD+DUU+HDD2HiRDj77Kgr2n0KfRGRb/DmmzBwIGzfDi+8AMcfH3VF9aM2fRGRGsycCb17h3b7V17J/sCHFEPfzAaY2TIzW2FmN9awzvlmttTMlpjZ2CrP7Wlma8zs3nQULSLS0B5/PBzhd+8Or70WxsPPBbU275hZU2AU0A9YDSwwsyJ3X1ppnQLgJqCXu28xs32rvMy/Ay+lr2wRkYbhDrffDjfdFPrgT5oEe+0VdVXpk8qR/tHACndf6e47gPFA1aGELgdGufsWAHffUP6EmR0J7AfMSE/JIiIN4+uv4eqrQ+APGxaGWMilwIfUQr8DsKrS49XJZZX1AHqY2StmNt/MBgCYWRPgL8D16ShWRKShfPEFnHcejBoF118PTzyRfX3wU5FK7x2rZplX8zoFQB+gIzDPzL4PXAQ85+6rzKp7meQbmI0ARgB07tw5hZJERNJn82YYPDiMn3PXXXDNNVFX1HBSCf3VQKdKjzsCn1Szznx3/wr4wMyWEXYCxwI/NLOfAd8CmpnZ5+6+08lgdx8NjAYoLCysukMREWkwq1dD//5hPPwJE8LRfi5LJfQXAAVm1g1YAwwFLqiyztPAMOARM2tLaO5Z6e4Xlq9gZsOBwqqBLyISpd/8Bj7+GGbMgBNOiLqahldrm767lwIjgenAO8A/3H2Jmd1qZoOTq00HNpnZUmAOcL27b2qookVE0mHbttA754IL4hH4AOaeWa0phYWFXlxcHHUZIhID48eHXjpz5oSJULKZmS1098La1tMVuSISW4kEdOwYrrqNC4W+
iMTSp5/C88+HI/0mMUrCGH1UEZEKTz4ZJkS58MLa180lCn0RiaVEAg4+GA49NOpKGpdCX0Ri58MPw6iZF14I33DdaE5S6ItI7IwbF+6HDYu2jigo9EUkVtxD006vXtC1a9TVND6FvojEyqJFsGRJ/E7gllPoi0isJBKQl5f7Y+zURKEvIrFRVhba8wcMgLZto64mGgp9EYmNefPCqJpxbdoBhb6IxEgiAa1awemnR11JdBT6IhIL27eHq3DPOisEf1wp9EUkFqZNg61b4920Awp9EYmJRAL23RdOPjnqSqKl0BeRnPfZZzB1KgwZErprxplCX0Ry3qRJoU3/gqoTvcaQQl9Ecl4iAfvvDz17Rl1J9BT6IpLT1q6F2bPDUX7cRtSsjkJfRHLa+PHhSty499opp9AXkZyWSMCRR8L3vhd1JZlBoS8iOWvZMli4UEf5lSn0RSRnjR0b2vGHDIm6ksyh0BeRnFQ+WUrfvtC+fdTVZA6FvojkpDfegPffV9NOVQp9EclJiQQ0bw5nnx11JZlFoS8iOae0FCZMCEMo77VX1NVkFoW+iOScWbNgwwY17VRHoS8iOSeRgNat4dRTo64k8yj0RSSnlJTA5Mlw7rmhTV92ptAXkZxSVASff66mnZoo9EUkpyQS0LEj9O4ddSWZSaEvIjlj0yZ4/nkYNgyaKN2qpa9FRHLGk0+G7pqaLKVmCn0RyRmJBBx0EBx2WNSVZK6UQt/MBpjZMjNbYWY31rDO+Wa21MyWmNnY5LLDzey15LJFZqZhj0SkQXz0Ebz8cjiBq8lSalbrFMFm1hQYBfQDVgMLzKzI3ZdWWqcAuAno5e5bzGzf5FMlwCXuvtzM2gMLzWy6u29N+ycRkVgbOzbcq2nnm6VypH80sMLdV7r7DmA8cEaVdS4HRrn7FgB335C8f8/dlyd//gTYAOSnq3gREagYUbNXL+jaNepqMlsqod8BWFXp8erkssp6AD3M7BUzm29mA6q+iJkdDTQD3q/muRFmVmxmxRs3bky9ehER4O23YckS9c1PRSqhX13rmFd5nAcUAH2AYcAYM2v9/y9g1g54HLjU3ct2eTH30e5e6O6F+fn6R0BE6iaRgLw8OO+8qCvJfKmE/mqgU6XHHYFPqllnirt/5e4fAMsIOwHMbE/gWeB37j6//iWLiFQoK4Nx4+CUU6Bt26iryXyphP4CoMDMuplZM2AoUFRlnaeBEwHMrC2huWdlcv3JwGPu/mT6yhYRCebNg1Wr1LSTqlpD391LgZHAdOAd4B/uvsTMbjWzwcnVpgObzGwpMAe43t03AecDvYHhZvZW8nZ4g3wSEYmlRAJatYLBg2tfV8DcqzbPR6uwsNCLi4ujLkNEssD27dCuHZx2Gjz+eNTVRMvMFrp7YW3r6YpcEclazz8PW7aoaacuFPoikrUSCcjPh5NPjrqS7KHQF5Gs9NlnMHUqDBkSumtKahT6IpKVJk2CL79U005dKfRFJCslErD//tCzZ9SVZBeFvohknbVrYfbsMLiaRtSsG4W+iGSdCRPClbgaUbPuFPoiknUSCTjiCDjggKgryT4KfRHJKu+9B8XFOoG7uxT6IpJVEonQjj90aNSVZCeFvohkDfcwQ1bfvtC+fdTVZCeFvohkjQULYMUKNe3Uh0JfRLJGIgHNm8PZZ0ddSfZS6ItIVigthfHjYdAg2GuvqKvJXgp9EckKs2bBhg1q2qkvhb6IZIVEAlq3hoEDo64kuyn0RSTjlZTA5Mlw7rmhTV92n0JfRDLe1Knw+edq2kkHhb6IZLxEAjp0gN69o64k+yn0RSSjbdoE06bBsGHQRIlVb/oKRSSjPflk6K6ppp30UOiLSEYbOxYOOggOOyzqSnKDQl9EMtZHH8G8eeEoX5OlpIdCX0Qy1rhx4X7YsGjryCUKfRHJWIkEHHccdOsWdSW5Q6EvIhlp0SJYvFgncNNNoS8iGSmRgLw8OP/8qCvJLQp9Eck4ZWWhPf+UU6Bt26ir
yS0KfRHJOC+/DKtWqWmnISj0RSTjJBLQqhUMHhx1JblHoS8iGWXHjnAV7plnhuCX9FLoi0hGmTYNtmxR005DUeiLSEYZOxby86Ffv6gryU0KfRHJGJ99BkVFMGRI6K4p6ZdS6JvZADNbZmYrzOzGGtY538yWmtkSMxtbafmPzGx58vajdBUuIrln8mT48ks17TSkWvelZtYUGAX0A1YDC8ysyN2XVlqnALgJ6OXuW8xs3+TyfYBbgELAgYXJ392S/o8iItkukYDu3aFnz6gryV2pHOkfDaxw95XuvgMYD5xRZZ3LgVHlYe7uG5LLTwFmuvvm5HMzgQHpKV1Ecsm6dTBrFlxwgUbUbEiphH4HYFWlx6uTyyrrAfQws1fMbL6ZDajD74qIMH58uBJXTTsNK5VTJdXtc72a1ykA+gAdgXlm9v0UfxczGwGMAOjcuXMKJYlIrhk7Fo44Ag44IOpKclsqR/qrgU6VHncEPqlmnSnu/pW7fwAsI+wEUvld3H20uxe6e2F+fn5d6heRHLB8OSxYoKP8xpBK6C8ACsysm5k1A4YCRVXWeRo4EcDM2hKae1YC04H+Zra3me0N9E8uExH5f4lEaMcfMiTqSnJfrc077l5qZiMJYd0UeMjdl5jZrUCxuxdREe5Lga+B6919E4CZ/TthxwFwq7tvbogPIiLZyT2E/oknQged8Wtw5r5LE3ukCgsLvbi4OOoyRKSRvPFG6KL597/Dj38cdTXZy8wWunthbevpilwRidTYsdC8OZxzTtSVxINCX0QiU1oaumoOGgR77RV1NfGg0BeRyMyeDevXq9dOY1Loi0hkEolwhH/qqVFXEh8KfRFpdO6wdStMmgTnngstWkRdUXxo8FIR2Yk7bN8OJSUVt23bav+5Ls+VlIT3ATXtNDaFvkjMrFkD118Pa9fWHNJlZXV7TbMwtWHLlhX35T/vu++uy8sft28Pffo0yMeUGij0RWLkgw/gpJNgwwY48kho0wY6ddo1qKuGc20/N2+ukTGzhUJfJCbefRdOPjkcyc+ZA0cdFXVFEgWFvkgMvPUW9O8PTZrASy/BIYdEXZFERb13RHLc/PlhXJsWLWDuXAV+3Cn0RXLYnDmhSadNG5g3D3r0iLoiiZpCXyRHPfccDBwIXbqEwO/SJeqKJBMo9EVy0MSJcOaZcNBBoQ2/XbuoK5JModAXyTGPPhomIznqqDC2Tdu2UVckmUShL5JD7rsPhg8PJ25nzNDIlbIrhb5IjrjjDrjqKjj9dHjmmXDxlEhVCn2RLOcO//qvcMMNoVnnqac0gJnUTBdniWQxd7j2WrjzzjDV4OjR0LRp1FVJJtORvkiW+vpruOKKEPg//zk8+KACX2qn0BfJQl99BZdcEoL+t7+Fu+4KQyyI1EbNOyJZ5ssvYehQmDIF/vQnuPHGqCuSbKLQF8ki27bBWWfBzJlwzz0wcmTUFUm2UeiLZIl//hMGDYJXX4WHHoJLL426IslGCn2RLLBpE5xyCvzP/8C4cXD++VFXJNkqZ079uMMDD8DmzVFXIpJe69aFKQUXL4bJkxX4Uj85E/rLl8PVV4d/ecsnXBbJdh9/DD/8YZjm8NlnQ/OOSH3kTOj36BEuQy8qgnvvjboakfpbvjwE/saNYRydk06KuiLJBTkT+gDXXBOOhK67Dv77v6OuRmT3LV4MvXuH+Wxnz4bjjou6IskVORX6ZvDww5CfH/ox/+//Rl2RSN0VF8MJJ4Tt+aWX4Igjoq5IcklOhT6EscPHjoX334ef/Uzt+5JdXn4Z+vaFb387zHZ10EFRVyS5JudCH8K/xbfcAk88AY89FnU1IqmZORP69w+zXM2bB/vvH3VFkotyMvQhjEfSp0842n/33airEflmU6aE81EFBTB3LnTqFHVFkqtyNvSbNoVEAlq2DGOMf/ll1BWJVG/cODjnHDj8cJgzB/bbL+qKJJflbOgDtG8f5gtdtCiMOS6SacaMgQsvhF694IUXYJ99oq5I
cl1KoW9mA8xsmZmtMLNdxvQzs+FmttHM3kreflLpuTvMbImZvWNmfzMzS+cHqM3AgSHw77sPJk1qzHcW+WZ33QWXXx6GV5g2LZy8FWlotYa+mTUFRgGnAgcBw8ysuj4FE9z98ORtTPJ3jwN6AYcC3weOAk5IV/Gp+uMf4aij4LLL4MMPG/vdRXbmDn/4A/zyl2HEzKefDs2QIo0hlSP9o4EV7r7S3XcA44EzUnx9B1oAzYDmwB7A+t0ptD6aNYPx46GsDC64IExAIRIFd7jpJvjd7+Cii+Af/4DmzaOuSuIkldDvAKyq9Hh1cllV55jZIjObaGadANz9NWAOsDZ5m+7u71T9RTMbYWbFZla8cePGOn+IVHTvHuYPfe21MIm0SGMqK4MFC8LYULffDldeGc435WmcW2lkqYR+dW3wVS95mgp0dfdDgReARwHM7LvAgUBHwo6ir5n13uXF3Ee7e6G7F+bn59el/joZMiS0od52WxjLRKQhbd4c/sO85BL4znfg6KPDdSM33BDOMWl6Q4lCKscZq4HKvYY7Ap9UXsHdN1V6+CBwe/Lns4D57v45gJlNA44B5u5uwfV1111hEoqLLw5jk3/nO1FVIrmmrAzeegueey6cmJ0/Pyxr0wYGDAidCvr3D1eNi0QlldBfABSYWTdgDTAUuKDyCmbWzt3XJh8OBsqbcD4GLjezPxH+YzgBuCsdhe+uli1hwoRwYvfii2H6dB1xye7bujVcSTttWritWxeWFxaGdvuBA8PPTZtGW6dIuVpD391LzWwkMB1oCjzk7kvM7Fag2N2LgJ+b2WCgFNgMDE/++kSgL/A2oUnoeXefmv6PUTcHHwx33w0jRoT21ZtuiroiyRbu8PbbFUfzr7wCX38NrVuHrpcDB4Z7XWAlmco8w0YkKyws9OLi4gZ/H3cYNgwmTgyXvWvoWqnJZ5/BrFkVQb9mTVj+gx/AqaeGoO/ZUydlJVpmttDdC2tbL7abqVmYXnHBghD+b76pqyElcId33gkh/9xzYfCz0lLYc0/o1y+E/IAB4YpvkWwT29AH2Guv0LviuOPgJz+Bp54KOwOJn23bwmQl5UH/8cdh+SGHwK9+FYL+uONgjz2irVOkvmId+hBO6N52W5ht67774Kqroq5IGoM7vPdeRZPNSy/Bjh3QqlU4mv/tb0PTjUa7lFwT+9CHcDn87NnhiK5XrzDaoeSekhJ48cWKoF+5Miw/8EC4+uoQ8scfrytkJbcp9AldNh95JIT9kCGwcCF861tRVyXp8sUXMHJkmFHtyy/hX/4lTDJ+3XUh6Lt2jbpCkcaj0E/Kzw/j7/ftGwLikUeirkjSYcMGOOMMeP11uOKKMMBZ797QokXUlYlEQ6FfSZ8+cPPNcOut4Ujw4oujrkjqY+lSOO00WL8+nKQ/66yoKxKJnq5FreLmm8OR4E9/Gk70SXZ64YXQ2+aLL8JJWgW+SKDQryIvLzTztGihaRaz1ZgxFT1vXn899NASkUChX42OHUOb/ltvwa9/HXU1kqqysjCkxuWXh3MzL78MXbpEXZVIZlHo12DQIPjFL+Cee8LMRpLZvvgChg4N11xccQU8+2y4+E5EdqbQ/wa33QZHHgk//nHFFZqSedavhxNPDOMo/fnPcP/9GgdHpCYK/W/QvHkYpqG0NEyzWFoadUVS1dKlcMwxsGhRmPj+2ms1lIbIN1Ho1+K73w0Ds73yCvzbv0VdjVT2wgtw7LHhZPvcuXDmmVFXJJL5FPopGDYsNPH88Y8haCR65T10OncOPXQKax1QVkRAoZ+yv/0NDjggXLC1fn3U1cRXWRnceGPooXPSSeE/sM6do65KJHso9FPUqlWYZnHr1jDRdVlZ1BXFzxdfhGsnbr8drrwSnnkmjHEvIqlT6NfBIYeEidVnzAi9RKTxrF8fhsl46in4y1/CMNjqoSNSdwr9OhoxAs49N4y3Pn9+1NXEw5IlYTrCxYtDD51f/Uo9dER2l0K/jszgwQfD
VbtDh4bmHmk4M2eGMXS2b1cPHZF0UOjvhtatQ//9NWvCNIsZNrd8znjwwdBDp0uX0EPnyCOjrkgk+yn0d1PPnqEL51NPhX78kj5lZXDDDaEprV+/MIaOeuiIpIdCvx6uvRZOOSWM0bNoUdTV5IaSEjj/fLjjjjC89dSp6qEjkk4K/Xpo0gQeewz23jt0Jdy2LeqKstu6dWEMnUmT4K9/hVGj1ENHJN0U+vW0777wxBOwbFmYXFt2z5IlYQydxYth8uQwWb166Iikn0I/DU46KXThfPjhMAGL1E15D50dO0IPnTPOiLoikdyl0E+TW26B448PV4ouXx51Ndlj9OjQQ6drV/XQEWkMCv00ycuDsWNhjz3CiciiInj33XD0KrsqKwuzkl1xBfTvH3rodOoUdVUiuU+nydKoUyd49FE455yKJoomTaBbN+jRY9dbx47h+bgpKQkD102aBD/7Gdx9t07YijQW/aml2emnw4YN8N57u97mzt25h0+LFlBQUP0OoU2b3DyRuW4dDB4MxcVw551wzTW5+TlFMpVCvwG0bg1HHx1ulbnD2rW77gwWL4YpU3aemWvvvavfGRQUhBE/s9HixXDaafDpp2He4cGDo65IJH4U+o3IDNq3D7c+fXZ+rrQUPvxw1x3Ciy/C44/vvG6HDtXvELp1C+cUMtGMGXDeeWGHNXeuTtiKREWhnyHy8sLUjN/9LgwcuPNzJSWwYsWuO4SJE2HTpor1mjaF7t0rdgJdukCzZmFHkJdX//vqlqVyTuKBB+Cqq+Dgg8MY+DphKxIdhX4WaNkSDj003KratCl0Ea26Q5g9O0w60tCaNPnmHUWTJvD++2FHNn48fPvbDV+TiNQspdA3swHA3UBTYIy731bl+eHAfwJrkovudfcxyec6A2OAToADA939w3QUL+GEb5s24WrWysrKYMsW+OqrcCstje5++PAwxaF66IhEr9Y/QzNrCowC+gGrgQVmVuTuS6usOsHdR1bzEo8Bf3D3mWb2LUATDTaCJk3CzkBEpLJUeokfDaxw95XuvgMYD6R0obyZHQTkuftMAHf/3N1LdrtaERGpl1RCvwOwqtLj1cllVZ1jZovMbKKZlZ+q6wFsNbNJZvammf1n8j8HERGJQCqhX92lM1XnipoKdHX3Q4EXgEeTy/OAHwLXAUcB3YHhu7yB2QgzKzaz4o0bN6ZYuoiI1FUqob+acBK2XEfgk8oruPsmd9+efPggcGSl330z2TRUCjwNHFH1Ddx9tLsXunthfn5+XT+DiIikKJXQXwAUmFk3M2sGDAWKKq9gZu0qPRwMvFPpd/c2s/Ik7wtUPQEsIiKNpNbeO+5eamYjgemELpsPufsSM7sVKHb3IuDnZjYYKAU2k2zCcfevzew6YJaZGbCQ8J+AiIhEwNyrNs9Hq7Cw0IuLi6MuQ0Qkq5jZQncvrG29GA7sKyISXxl3pG9mG4GPoq6jntoCn0ZdRAbR97EzfR8V9F3srD7fRxd3r7UnTMaFfi4ws+JU/s2KC30fO9P3UUHfxc4a4/tQ846ISIwo9EVEYkSh3zBGR11AhtH3sTN9HxX0Xeyswb8PtemLiMSIjvRFRGJEoV9PZtbJzOaY2TtmtsTMrkku38fMZprZ8uT93lHX2ljMrGlyVNVnko+7mdnrye9iQnI4j1gws9bJkWffTW4jx8Z82/hl8u9ksZmNM7MWcdo+zOwhM9tgZosrLat2e7Dgb2a2IjmC8S7jlu0OhX79lQLXuvuBwDHAVcl5BG4EZrl7ATAr+TgurqFi/CWA24E7k9/FFuCySKqKxt3A8+5+AHAY4XuJ5bZhZh2AnwOF7v59wrAuQ4nX9vEIMKDKspq2h1OBguRtBHB/Wipwd93SeAOmEGYZWwa0Sy5rByyLurZG+vwdkxtuX+AZwtDcnxIm0wE4FpgedZ2N9F3sCXxA8txZpeVx3TbK5+bYhzDu1zPAKXHbPoCuwOLatgfgAWBYdevV56Yj/TQys67A
D4DXgf3cfS1A8n7f6CprVHcBv6ZiWsw2wFYPQ2tDzZPw5KLuwEbg4WRz1xgza0VMtw13XwP8GfgYWAv8kzAIY1y3j3I1bQ+pTmBVJwr9NEnO//sU8At3/yzqeqJgZoOADe6+sPLialaNS5exPML8Efe7+w+AbcSkKac6ybbqM4BuQHugFaEJo6q4bB+1aZC/HYV+GpjZHoTAT7j7pOTi9eXzDCTvN0RVXyPqBQw2sw8Jcyn3JRz5tzaz8mG8d5mEJ4etBla7++vJxxMJO4E4bhsAJwMfuPtGd/8KmAQcR3y3j3I1bQ+1TmC1OxT69ZScJ+DvwDvu/tdKTxUBP0r+/CNCW39Oc/eb3L2ju3clnKCb7e4XAnOAc5OrxeK7AHD3dcAqM/tectFJhEmEYrdtJH0MHGNmLZN/N+XfRyy3j0pq2h6KgEuSvXiOAf5Z3gxUH7o4q57M7HhgHvA2Fe3YvyG06/8D6EzY2M9z982RFBkBM+sDXOfug8ysO+HIfx/gTeAir5heM6eZ2eHAGKAZsBK4lHCwFcttw8x+Dwwh9Hp7E/gJoZ06FtuHmY0D+hBG01wP3EKYRnaX7SG5Y7yX0NunBLjU3es92YhCX0QkRtS8IyISIwp9EZEYUeiLiMSIQl9EJEYU+iIiMaLQFxGJEYW+iEiMKPRFRGLk/wBUtU35LnOTDQAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x17595fc88>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Plot the score collected for each K against the number of clusters to find the\n",
    "# best-scoring K. CH_scores holds the first value returned by K_cluster_analysis,\n",
    "# which is the silhouette coefficient.\n",
    "plt.plot(Ks, np.array(CH_scores), 'b-')\n",
    "plt.xlabel('K (number of clusters)')\n",
    "plt.ylabel('silhouette score (CH_score)')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "对不感兴趣的标签，K=100时CH_score（此处实际计算的是轮廓系数 silhouette score，而非 Calinski-Harabasz 指数）最大，约为0.65"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 268,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\pandas\\core\\series.py:696: FutureWarning: \n",
      "Passing list-likes to .loc or [] with any missing label will raise\n",
      "KeyError in the future, you can use .reindex() as an alternative.\n",
      "\n",
      "See the documentation here:\n",
      "http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike\n",
      "  return self.loc[key]\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAwMAAAD8CAYAAAArDnhyAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzt3XuYXHWd5/H3N92dCx0gXEIICQzBCSgogjSIywxGBMQZINn1MrDsGkU2sy5h3HFdRZ0dR8BdYXiGGRVmZJGLIyMoimQcFbmYcXdWIJ0RuQkk3ENCLobcSEjfvvvHOd1Ud7o7CdXp7nDer+fpp+r8zu/8zrdPVTr1qfOrU5GZSJIkSaqeMSNdgCRJkqSRYRiQJEmSKsowIEmSJFWUYUCSJEmqKMOAJEmSVFGGAUmSJKmihiQMRMT1EbEqIh4ZYH1ExFcjYmlEPBQR76hZNzcilpQ/c4eiHkmSJEnbN1RnBm4Ezhhk/fuBmeXPPOBvASJiX+CLwDuBE4AvRsQ+Q1STJEmSpEEMSRjIzF8AawfpMhv4VhbuAyZFxFTgfcBdmbk2M18G7mLwUCFJkiRpiDQO036mAS/ULC8r2wZq30ZEzKM4q0Bzc/Nxb37zm3dNpZL0BrV48eI1mTl5pOuQJI0ewxUGop+2HKR928bMa4FrAVpaWrK1tXXoqpOkCoiI50a6BknS6DJcVxNaBhxcszwdWD5IuyRJkqRdbLjCwALgI+VVhU4E1mfmCuBO4PSI2Kf84PDpZZskSZKkXWxIpglFxHeAWcD+EbGM4gpBTQCZ+XfAj4E/AJYCm4GPlevWRsSlwKJyqEsyc7APIkuSJEkaIkMSBjLz3O2sT+DCAdZdD1w/FHVIkiRJ2nF+A7EkSZJUUYYBSZIkqaIMA5IkSVJFGQYkSZKkijIMSJIkSRVlGJAkSZIqyjAgSZIkVZRhQJIkSaoow4AkSZJUUYYBSZIkqaIMA5IkSVJFGQYkSZKkijIMSJIkSRVlGJAkSZIqyjAgSZIkVZRhQJIkSaoow4AkSZJUUYYBSZIkqaKGJAxExBkR8URELI2Ii/tZf1VEPFj+PBkR62rWddasWzAU9UiSJEnavsZ6B4iIBuBq4DRgGbAoIhZk5mPdfTLzT2v6XwQcWzPElsw8pt46JEmSJO2coTgzcAKwNDOfzsw24BZg9iD9zwW+MwT7lSRJklSHoQgD04AXapaXlW3biIjfAWYA99Y0j4+I1oi4LyLmDEE9kiRJknZA3dOEgOinLQfoew5wW2Z21rQdkpnLI+Iw4N6IeDgzn9pmJxHzgHkAhxxySL01S5IkSZU3FGcGlgEH1yxPB5YP0Pcc+kwRyszl5e3TwEJ6f56gtt+1mdmSmS2TJ0+ut2ZJkiSp8oYiDCwCZkbEjIgYS/GCf5urAkXEEcA+wC9r2vaJiHHl/f2Bk4DH+m4rSZIkaejVPU0oMzsiYj5wJ9AAXJ+Zj0bEJUBrZnYHg3OBWzKzdgrRW4BvREQXRTD5Su1ViCRJkiTtOtH7tfnuoaWlJVtbW0e6DEnarUTE4sxsGek6JEmjh99ALEmSJFWUYUCSJEmqKMOAJEmSVFGGAUmSJKmiDAOSJElSRRkGJEmSpIoyDEiSJEkVZRiQJEmSKsowIEmSJFWUYUCSJEmqKMOAJEmSVFGGAUmSJKmiDAOSJElSRRkGJEmSpIoyDEiSJEkVZRiQJEmSKsowIEmSJFWUYUCSJEmqKMOAJEmSVFFDEgYi4oyIeCIilkbExf2s/2hErI6IB8ufC2rWzY2IJeXP3KGoR5IkSdL2NdY7QEQ0AFcDpwHLgEURsSAzH+vT9dbMnN9n232BLwItQAKLy21frrcuSZIkSYMbijMDJwBLM/PpzGwDbgFm7+C27wPuysy1ZQC4CzhjCGqSJEmStB1DEQamAS/ULC8r2/r6QEQ8FBG3RcTBO7ktETEvIlojonX16tVD
ULYkSZJUbUMRBqKftuyz/I/AoZl5NHA3cNNObFs0Zl6bmS2Z2TJ58uTXXawkSZKkwlCEgWXAwTXL04HltR0y87eZubVc/N/AcTu6rSRJkqRdYyjCwCJgZkTMiIixwDnAgtoOETG1ZvFs4Dfl/TuB0yNin4jYBzi9bJMkSZK0i9V9NaHM7IiI+RQv4huA6zPz0Yi4BGjNzAXAn0TE2UAHsBb4aLnt2oi4lCJQAFySmWvrrUmSJEnS9kVmv1P0R7WWlpZsbW0d6TIkabcSEYszs2Wk65AkjR5+A7EkSZJUUYYBSZIkqaIMA5IkSVJFGQYkSZKkijIMSJIkSRVlGJAkSZIqyjAgSZIkVZRhQJIkSaoow4AkSZJUUYYBSZIkqaIMA5IkSVJFGQYkSZKkijIMSJIkSRVlGJAkSZIqyjAgSZIkVZRhQJIkSaoow4AkSZJUUYYBSZIkqaKGJAxExBkR8URELI2Ii/tZ/6mIeCwiHoqIeyLid2rWdUbEg+XPgqGoR5IkSdL2NdY7QEQ0AFcDpwHLgEURsSAzH6vp9iugJTM3R8QngCuAPyrXbcnMY+qtQ5IkSdLOGYozAycASzPz6cxsA24BZtd2yMyfZ+bmcvE+YPoQ7FeSJElSHYYiDEwDXqhZXla2DeTjwE9qlsdHRGtE3BcRcwbaKCLmlf1aV69eXV/FkiRJkuqfJgREP23Zb8eI/wC0AO+uaT4kM5dHxGHAvRHxcGY+tc2AmdcC1wK0tLT0O74kSZKkHTcUZwaWAQfXLE8HlvftFBGnAl8Azs7Mrd3tmbm8vH0aWAgcOwQ1SZIkSdqOoQgDi4CZETEjIsYC5wC9rgoUEccC36AIAqtq2veJiHHl/f2Bk4DaDx5LkiRJ2kXqniaUmR0RMR+4E2gArs/MRyPiEqA1MxcAfwlMBL4XEQDPZ+bZwFuAb0REF0Uw+UqfqxBJkiRJ2kUic/ebft/S0pKtra0jXYYk7VYiYnFmtox0HZKk0cNvIJYkSZIqyjAgSZIkVZRhQJIkSaoow4AkSZJUUYYBSZIkqaIMA5IkSVJFGQYkSZKkijIMSJIkSRVlGJAkSZIqyjAgSZIkVZRhQJIkSaoow4AkSZJUUYYBSZIkqaIMA5IkSVJFGQYkSZKkimoc6QJ2d1dfDZdfDm1tMG8efOlLENG7/dhj4ZFHoL29dx8o2t79bli8uOj7zDNw6KGvbb9uXdF3/Hj44z8utr3mmqu5/PLLWbduHe3t7bz66qvb1NXc3MzWrVvJTDo7O2lqaqK9vR2AAw88kMxk3rx5HHDAAXzmM59hy5YtAIwbN46tW7f2jNPY2Mib3vQmnnjiiV7jn3TSSfziF79gzJjeeXLjj2H9zZCbgQC6etc1Zgpke7l+27KL+r4Bry6GjT+A7IDmU+HVh6B9ac148dr4Y/YBEiaeDkuubGXTrzYVv8vB4zjhmRNoaGjg0Q8/yuofrIYuiKZgzN5jmP6fp9N0QBPP/o9n6VjfUf7CMGb8GBr2aOCgeQdx6JcOJSJ48eoXef7y5+lq62LPY/dkw/0b6FjfwZgJY5j+p9OZcckMovtBHUD3GG0vtbF8xpMsO+JxGqc08I7Tf5+TP3gmEcEDP7mXe//hB3R2dALwx3/1F/zob7/FS888T2dHBxd+9TImTd5vwH10dnTy95f8Va/+S3/1ML9c8DM6Ozo59r2/17Ovvtssf+pZsquLCXtOpOvICXy160bautqYd9w8vjTrS1yz6Bou/5fL2dqxlbGNY1n1yiraOtt45pPPcOikQ7eppb2znXff+G4Wr1hMW2cbUydOpSu7esbb3vEaTHtnOyffeDL3L7ufJAEGrGNH/OSffs0Pb2ulo6OLU9/3Vs4570Qigk9/8h945unVAOy//0Suue6jNDQ07NQYr0dHRyd//rnv89TSVXR0dPK3132MA6bstdP1725i4cILgc8CY4FrgS/mrFk5slUNjVi4sAn4Z+A4it9vRs6a9Wwd471hj9Xu
7I36uAz181e71sKFsc3jNWtWPjtQf88M1GHxYpg/v/i58kq49FK4/fbe7RdeCD/9KbznPb37dIuAM8+E2bO3HXfOHNi4ETZsgHPOKba98srFzJ8/nzlz5rBx48Z+gwDAK6+8wvHHH09nZ/Gi8l3vehdHHXUUACeeeCJXXnkll156KRdddBFbtmzh6KOPBugVBAA6Ojq2CQIA//Iv/8LFF1/cq61tKay7FvIVaJrJNkEAoGslNO7NgEEAYNPPinEm/gFM+ihsvA0a9nhtvHHHAfnacvN7in4bvgv50ngmHDGBvU/Zm60vbOWxDzzGS99+idXfW81e79wLErIt2fv4vXnu0udYetFSOtZ1FOMl0A5dG7s44I8O4LlLn2PN7WvYuHgjS+YvYdr8aUy7cBprf7qWjpc7mPyhyXS90sXzlz3PmtvXDPwLQc8YB/2Xg2ic08XT7/hXpi75XU7+g7P4vz/4MU8sepAVTz/HXd/6Hr9z5OEc9KbfAeCZh3/DzHe8lcOPO3rQ8btF0Kv/qudf5M4bbqXl9Fm897x/17OvvtsceOh0sqs4oIe85yi23r+a/7L/x7jy9Cu59BeXcuX/u5L5P5nP/BPmc8VpV7BswzKOm3rcdmoJzjz8TE4+5GQAPvL2j/SMd/vjtw+67fZ/z+APZ/4hRx1wVF3jADy1dCXX/d1C3n/m2/nI+b/Pbbc+wP2/fIrv3XI/zzy9moOmTeKtR09nzZpN/OX/+qedGuP1igiOO/5Qjn/nYa+7/t1NLFx4HPD18ufTwP8A/u2IFjW0EvgRcEe9A1XgWO2W3uCPy5A9fzUsdurxMgzUYcGC4vb88+G886C5Ge64o3d7+WY8W7f27tOtsRE+/3k4/PBtxx03rrjdY4/iDEFzM3z72wvKdeN61dL3HfoJEyYwc+bMnvarrrqKSZMmAdDe3s55551HU1NTT/9bb72VxsbeJ4r67qOv73//+72WtywatHuPjpWDr99yX3HbfCrs8W6I8dC57rX1e51btHWbcFLRr6uzk+YD9+eIG4/g6J8VL4ZfvudlVvzvFcV4xzT3bLP58c1EU593Trt//QZof7mdMc1jWHPHGtYsKF7oH3j+gWT7a2/w/O7XfpcxzWOgEdbcMXgY6B5j6gVT6fhQceZiytOH8ZZjjqNp3DiebP01Ty5+CIDZF57PoW99MwDPPfoEJ815P/tOPWDwg1Ya09DQq/8zjzwOwNtn/Rve+nsn9Oyr7zYT9pzYs/zwHkvYGu28af1BnPe282huaubbD30bgPOPPZ+PvP0jNDc1s6lt06C1NI5p5PO//3le7SiS34eP+nDPeHc8Ud//J41jGvmzk/+M2UfM3n7n7Vh0/9MAnHLqUZw86wjGj29i0f1Pc/fPHgFg/n89nT+/ZA4AD/962U6N8Xo1NIzhAx8+gYOmTXrd9e+Gzi5vrwduBl4B6n+AR4mcNasjZ836n8CTQzDcG/pY7cbesI/LED9/tYvNmpUds2blDj9eQxIGIuKMiHgiIpZGxMX9rB8XEbeW6++PiENr1n2ubH8iIt43FPUMl5Xli9o99yzeXZ04sWirbV+1qrj/8su9++zIuOvWvTbOypXFtmvXrizXreu1zdixY3stNzc3s3Llyp4w0NzczObNmwFYv349EdHrxf9BBx3UKxwU+91z0DrXr1/fa7n2BXvXKwNvl+2DDltMIQLGTCiOWYyHro2vrW+Y0DsMjBlf9MvOThrGjWWPN+3RM5Ujtybtq4sddm0pTyWMgY71HURj7zAwpqk4VtEQtK9qp2FiA20r22hb2QZA456NtK1q6+nftFcTDRMbiDHR02cgtWO8sr74ZRo6GokIxo4fxyvrN/a0j50wrmeKx5YNgxzIHbBl46ZeY3bvq6/atjVbfkvbmHbaNr5KRDBx7ETWvroWgD3H7tnT9krbjtX2SnvRr7mpuWfblZu2849gGK1fVzzhJuzRREQwfnwT69ZtZsvm4jE7cOrePc+n9vbO
nRpjOIzkvofYlPJ2YzmtYlNNm3rzWI1OPi7aLdUdBiKiAbgaeD9wJHBuRBzZp9vHgZcz83eBq4DLy22PBM4BjgLOAK4px9stTCn/iW/YAJnFlJ4pU3q3H1C+obvPPr377Mi4e+9d3HZvs3Ej7LvvlHLd3r22aWvr/WJ006ZNTJkypWea0KZNm9hjjz16ts1MOjo6evovW7ZsmzE2bNgwaJ19a2ioeRNzTDMDiqaB1wFEUSZdm4tjlltgTE0u6dxctHXr2lL0i4YGOre2sfnJzT2/d4wLmiYXOxwzvny6d0Hj3o1kR+9pnF1tRVjIzqTpgCY6N3YydspYxk4pglbHhg7GHvBa6Gpf307nxk6yM3v6DKR2jOa9i1+ms6mdzKTt1Vdp3nvPnvatW14ls6htwl6DHMgd0P2Of/eY3fvqq7Zt/wn7Ma6riaaJ48lMNrZtZN/x+wKwYeuGnrbmsTtWW3NT0W9T26aebadMHD3/P+49qXjCbd7cRmayZUsbkybtwYQ9isds+Ysv9zyfmpr6//M00BjDYST3PcS6E+JesXBhAHvWtKk3j9Xo5OOi3dJQnBk4AViamU9nZhtwC9ueFpsN3FTevw14bxRvfc4GbsnMrZn5DLC0HG+3cOaZxe0NN8DNN8PmzXDWWb3bu998b2rq3afW44/Db39b3H/qKTjxxOJ+9/T9zZuLYLB5M5x77pnlut5z+7u6ek/Qf/XVV9m0aVPPi8rLL7+cTZs29ay7+uqrez5QDHDuuef2vODp1jcc9DVnzpxey+NbahYG+bhUw3ZmvIwtp0m/cg9s/mfIrTCuZsr8hpuKtm6v3AmbflxMd3nlpd/yxMef4KHTiyk3k2ZN4sDzDwRg069fm9Yy4YgJvab8AND52m3jpEa6Nnex31n7sd+ZxQd2X7rhpV4fuV/6J0vp2twFnbDfWQN/qBfoNcbERyYDsHLGMzx05320b21j5nFHM/MdbwPglwt+xotLnwVg8vSDeO43S9iyqXh3/eWVq9n48vptd1BjzYsv9fTfr5wu9NDCX/LI/32gZ199TZ5+UM/9w1dOZ2w28fTey7n54ZvZ3L6Zc992LgA3PHhDT9uBE4vj+tTap1ixcUW/tTy+5nEOaC5q+Mbib3D1oqvZ3L6Zsw4/q9/+O+PxNY+zdO3SnuUHlj0wYB2DaTl+BgA/v/sxfrHwCbZu7aDlhBm855S3AHDNV+/mkj//IQBHvW3aTo1Rj2UvrGXjxmKK1UsvrePltf2fidkV+x4hPypvPwacB+wB/OPIlTP0YuHCNwPdfyzeFAsXTn2dQ73hj9Vu6g39uAzh81fDYOHC6PV4LVwYAz5e0f1i8fWKiA8CZ2TmBeXyfwTemZnza/o8UvZZVi4/BbwT+Avgvsz8dtn+TeAnmXlbP/uZB8wDOOSQQ4577rnn6qp7qHzta3DFFcWVgC64AC67rJiyUtt+zDHw6KPF5wdq+3Tre9GPuXPhuOOK7btn4owfD//pPxXbfv3rX+OKK67ouZpQ32CwI8aPH8+nPvUppkyZwsUXX9xzNaG+IoKZM2fy5JO9p51NnjyZl156adurCf0I1v9D+c59f1cTmlxcIYgtkIN8iLhpJnStLa8mdFrxIeJBNcGes2HJlYt45dfFi6ax08byzufeSUNDA4988BHW/HDNa1cT2msM0+ZNo2lKE8998bneVxMaV1xNaOoFU5lxWXGVoGVfW8YLV7xAV1sXE98+kQ2LNtC5oZMx48cw7ZPTOOzLh2336i3dY2xdtpXlM5/kxbc8To7p4rDmt/GBvz+fiGDRT3/Oz2767qDjHH3yiZz1ibkDrv/yuZ/otXzQmw5l48vr6ero4O2nnMSsD5+9Ta19txmz3ziuOez7tHW1ccGxF3DZKZfx9Qe+zhX/7wraOttY9cqqXv3nvn0uN865cZta4ku99zO+cTyfOvFTXHbKZXVf7abv2IPVsT0//scH+eEPFtPR0cl7T3sr//4/vouI4L/9yc08
+0zxeY9992vm7775sQGvJjTQGK/XB876m17Ls055Cxf96el17TsiFmdmSz9DjAqxcOFFwGcorn5xHfBnb4QrsXSLhQv7/i435axZH32dY72hj9Xu6o38uAzl81e73sKFsc3jNWtWfrS/vkMRBj4EvK9PGDghMy+q6fNo2ac2DJwAXAL8sk8Y+HFmfp9BtLS0ZGtra111S1LVjPYwIEkafkMxTWgZcHDN8nRg+UB9IqIR2BtYu4PbSpIkSdoFhiIMLAJmRsSMiBhL8YHgBX36LAC65zV8ELg3i1MSC4BzyqsNzQBmAg8MQU2SJEmStqPubyDOzI6ImA/cCTQA12fmoxFxCdCamQuAbwJ/HxFLKc4InFNu+2hEfBd4DOgALszM/q/dJ0mSJGlI1f2ZgZHgZwYkaef5mQFJUl9+A7EkSZJUUYYBSZIkqaIMA5IkSVJFGQYkSZKkijIMSJIkSRVlGJAkSZIqyjAgSZIkVZRhQJIkSaoow4AkSZJUUYYBSZIkqaIMA5IkSVJFGQYkSZKkijIMSJIkSRVlGJAkSZIqyjAgSZIkVZRhQJIkSaoow4AkSZJUUYYBSZIkqaLqCgMRsW9E3BURS8rbffrpc0xE/DIiHo2IhyLij2rW3RgRz0TEg+XPMfXUI0mSJGnH1Xtm4GLgnsycCdxTLve1GfhIZh4FnAH8dURMqln/3zPzmPLnwTrrkSRJkrSD6g0Ds4Gbyvs3AXP6dsjMJzNzSXl/ObAKmFznfiVJkiTVqd4wMCUzVwCUtwcM1jkiTgDGAk/VNH+5nD50VUSMG2TbeRHRGhGtq1evrrNsSZIkSdsNAxFxd0Q80s/P7J3ZUURMBf4e+FhmdpXNnwPeDBwP7At8dqDtM/PazGzJzJbJkz2xIEmSJNWrcXsdMvPUgdZFxMqImJqZK8oX+6sG6LcX8E/An2XmfTVjryjvbo2IG4BP71T1kiRJkl63eqcJLQDmlvfnAnf07RARY4HbgW9l5vf6rJta3gbF5w0eqbMeSZIkSTuo3jDwFeC0iFgCnFYuExEtEXFd2efDwMnAR/u5hOjNEfEw8DCwP3BZnfVIkiRJ2kGRmSNdw05raWnJ1tbWkS5DknYrEbE4M1tGug5J0ujhNxBLkiRJFWUYkCRJkirKMCBJkiRVlGFAkiRJqijDgCRJklRRhgFJkiSpogwDkiRJUkUZBiRJkqSKMgxIkiRJFWUYkCRJkirKMCBJkiRVlGFAkiRJqijDgCRJklRRhgFJkiSpogwDkiRJUkUZBiRJkqSKMgxIkiRJFWUYkCRJkiqqrjAQEftGxF0RsaS83WeAfp0R8WD5s6CmfUZE3F9uf2tEjK2nHkmSJEk7rt4zAxcD92TmTOCecrk/WzLzmPLn7Jr2y4Gryu1fBj5eZz2SJEmSdlC9YWA2cFN5/yZgzo5uGBEBnALc9nq2lyRJklSfesPAlMxcAVDeHjBAv/ER0RoR90VE9wv+/YB1mdlRLi8Dpg20o4iYV47Runr16jrLliRJktS4vQ4RcTdwYD+rvrAT+zkkM5dHxGHAvRHxMLChn3450ACZeS1wLUBLS8uA/SRJkiTtmO2Ggcw8daB1EbEyIqZm5oqImAqsGmCM5eXt0xGxEDgW+D4wKSIay7MD04Hlr+N3kCRJkvQ61DtNaAEwt7w/F7ijb4eI2CcixpX39wdOAh7LzAR+DnxwsO0lSZIk7Rr1hoGvAKdFxBLgtHKZiGiJiOvKPm8BWiPi1xQv/r+SmY+V6z4LfCoillJ8huCbddYjSZIkaQdF8Qb97qWlpSVbW1tHugxJ2q1ExOLMbBnpOiRJo4ffQCxJkiRVlGFAkiRJqijDgCRJklRRhgFJkiSpogwDkiRJUkUZBiRJkqSKMgxIkiRJFWUYkCRJkirKMCBJkiRVlGFAkiRJqijDgCRJklRRhgFJkiSpogwDkiRJUkUZBiRJkqSKMgxIkiRJFWUYkCRJkirKMCBJkiRVlGFAkiRJqqi6wkBE
7BsRd0XEkvJ2n376vCciHqz5eTUi5pTrboyIZ2rWHVNPPZIkSZJ2XL1nBi4G7snMmcA95XIvmfnzzDwmM48BTgE2Az+r6fLfu9dn5oN11iNJkiRpB9UbBmYDN5X3bwLmbKf/B4GfZObmOvcrSZIkqU71hoEpmbkCoLw9YDv9zwG+06ftyxHxUERcFRHj6qxHkiRJ0g5q3F6HiLgbOLCfVV/YmR1FxFTgbcCdNc2fA14CxgLXAp8FLhlg+3nAPIBDDjlkZ3YtSZIkqR/bDQOZeepA6yJiZURMzcwV5Yv9VYMM9WHg9sxsrxl7RXl3a0TcAHx6kDqupQgMtLS05PbqliRJkjS4eqcJLQDmlvfnAncM0vdc+kwRKgMEEREUnzd4pM56JEmSJO2gesPAV4DTImIJcFq5TES0RMR13Z0i4lDgYOCf+2x/c0Q8DDwM7A9cVmc9kiRJknbQdqcJDSYzfwu8t5/2VuCCmuVngWn99Dulnv1LkiRJev38BmJJkiSpogwDkiRJUkUZBiRJkqSKMgxIkiRJFWUYkCRJkirKMCBJkiRVlGFAkiRJqijDgCRJklRRhgFJkiSpogwDkiRJUkUZBiRJkqSKMgxIkiRJFWUYkCRJkirKMCBJkiRVlGFAkiRJqijDgCRJklRRhgFJkiSpogwDkiRJUkUZBiRJkqSKqisMRMSHIuLRiOiKiJZB+p0REU9ExNKIuLimfUZE3B8RSyLi1ogYW089kiRJknZcvWcGHgH+HfCLgTpERANwNfB+4Ejg3Ig4slx9OXBVZs4EXgY+Xmc9kiRJknZQXWEgM3+TmU9sp9sJwNLMfDoz24BbgNkREcApwG1lv5uAOfXUI0mSJGnHNQ7DPqYBL9QsLwPeCewHrMvMjpr2aQMNEhHzgHnl4qaI2F4IGWr7A2uGeZ87wrp2zmitC0Zvbda1c0ZrXQBHjHQBkqTRZbthICLuBg7sZ9UXMvOOHdhH9NOWg7T3KzOvBa7dgf3tEhHRmpkDfi5ipFjXzhmtdcHorc26ds5orQuK2ka6BknS6LLdMJCZp9a5j2XAwTXL04HlFO+cTYqIxvLsQHe7JEmSpGEwHJcWXQTMLK8cNBY4B1iQmQn8HPhg2W8usCNnGiRJkiQNgXpn416SAAAF0ElEQVQvLfpvI2IZ8C7gnyLizrL9oIj4MUD5rv984E7gN8B3M/PRcojPAp+KiKUUnyH4Zj317GIjNkVpO6xr54zWumD01mZdO2e01gWjuzZJ0giI4g16SZIkSVXjNxBLkiRJFWUYkCRJkirKMLAdEXFRRDwREY9GxBU17Z+LiKXluveNUG2fjoiMiP3L5YiIr5Z1PRQR7xiBmv4yIh4v9397REyqWTeixywizij3vTQiLh7u/dfUcXBE/DwiflM+rz5Ztu8bEXdFxJLydp8Rqq8hIn4VET8ql2dExP1lXbeWFwIYibomRcRt5fPrNxHxrtFwzCLiT8vH8ZGI+E5EjB+JYxYR10fEqoh4pKat3+MzGv5WSJJGB8PAICLiPcBs4OjMPAq4smw/kuKqSEcBZwDXRETDMNd2MHAa8HxN8/uBmeXPPOBvh7Om0l3AWzPzaOBJ4HMw8ses3NfVFMfoSODcsqaR0AH8t8x8C3AicGFZy8XAPZk5E7inXB4Jn6T4sH+3y4GryrpeBj4+IlXB3wA/zcw3A2+nqHFEj1lETAP+BGjJzLcCDRTP85E4ZjdS/NuqNdDxGQ1/KyRJo4BhYHCfAL6SmVsBMnNV2T4buCUzt2bmM8BS4IRhru0q4DP0/qK22cC3snAfxfc4TB3OojLzZzXfKn0fxfdHdNc2ksfsBGBpZj6dmW3ALWVNwy4zV2Tmv5b3N1K8qJ1W1nNT2e0mYM5w1xYR04E/BK4rlwM4BbhthOvaCziZ8opjmdmWmesYBceM4vtaJkREI7AHsIIROGaZ+QtgbZ/mgY7PiP+tkCSNDoaBwR0O/H55uv+fI+L4sn0a8EJNv2Vl27CI
iLOBFzPz131WjWhd/Tgf+El5f6RrG+n99ysiDgWOBe4HpmTmCigCA3DACJT01xQhs6tc3g9YVxPwRuq4HQasBm4opzBdFxHNjPAxy8wXKc4YPk8RAtYDixkdxwwGPj6j8t+DJGn4bfcbiN/oIuJu4MB+Vn2B4vjsQzGV43jguxFxGBD99B/Sa7Rup67PA6f3t9murgsGry0z7yj7fIFiOszNw1nbIEZ6/9uIiInA94H/mpkbijfhR7SeM4FVmbk4ImZ1N/fTdSSOWyPwDuCizLw/Iv6GkZtG1aOcgz8bmAGsA75HMQWnr9F2DefR8rhKkkZY5cNAZp460LqI+ATwg/Lbkh+IiC5gf4p30Q6u6TodWD4cdUXE2yheePy6fPE4HfjXiDhhOOoarLaaGucCZwLvzde+yGJYahvESO+/l4hooggCN2fmD8rmlRExNTNXlFM2Vg08wi5xEnB2RPwBMB7Yi+JMwaSIaCzf6R6p47YMWJaZ95fLt1GEgZE+ZqcCz2TmaoCI+AHwbxgdxwwGPj6j6t+DJGnkOE1ocD+kmPtLRBwOjAXWAAuAcyJiXETMoPgQ3gPDUVBmPpyZB2TmoZl5KMV/6u/IzJfKuj5SXinkRGB99xSB4RIRZ1B8s/TZmbm5ZtWIHbPSImBmeZWXsRQf8lwwjPvvUc7D/ybwm8z8q5pVC4C55f25wB3DWVdmfi4zp5fPq3OAezPzPODnwAdHqq6ytpeAFyLiiLLpvcBjjPAxo5gedGJE7FE+rt11jfgxKw10fEb8b4UkaXTwG4gHUb5ovB44BmgDPp2Z95brvkAxJ76DYprHTwYcaNfW+CzFlUzWlC9Gvk5xRZHNwMcys3WY61kKjAN+Wzbdl5n/uVw3osesfMf7rymu+HJ9Zn55OPdfU8fvAf8HeJjX5uZ/nuJzA98FDqF4kfmhzOz7gdDhqnEWxfP9zHJq3C3AvsCvgP/Q/aH6Ya7pGIoPNo8FngY+RvGGxoges4j4EvBHFM/rXwEXUMy/H9ZjFhHfAWZRnL1cCXyR4g2NbY7PaPhbIUkaHQwDkiRJUkU5TUiSJEmqKMOAJEmSVFGGAUmSJKmiDAOSJElSRRkGJEmSpIoyDEiSJEkVZRiQJEmSKur/AxbM+HVniUJgAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x45b204a8>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Show the clustering result: every cluster gets its own colour, and each plotted\n",
    "# sample is drawn as its true label at its first PCA coordinate.\n",
    "colors = ['b','g','r','k','c','m','y','#e24fff','#524C90','#845868']\n",
    "\n",
    "n_clusters = 10\n",
    "# Fixed seed so that the cluster assignment (and therefore this figure) is repeatable.\n",
    "mb_kmeans = MiniBatchKMeans(n_clusters = n_clusters, random_state = 0)\n",
    "mb_kmeans.fit(X_train_pca)\n",
    "y_train_pred = mb_kmeans.labels_\n",
    "cents = mb_kmeans.cluster_centers_  # cluster centroids\n",
    "\n",
    "# Positional view of the labels, so that label j pairs with row j of X_train_pca\n",
    "# whether y_train is a pandas Series or a NumPy array.\n",
    "# NOTE(review): assumes y_train is row-aligned with X_train_pca - TODO confirm.\n",
    "y_train_values = np.asarray(y_train).ravel()\n",
    "\n",
    "for i in range(n_clusters):\n",
    "    index = np.nonzero(y_train_pred==i)[0]  # row positions of the members of cluster i\n",
    "    # Only the first 100 training samples are drawn, to keep the plot readable.\n",
    "    for j in index[index < 100]:\n",
    "        # j is a row position in the full training set, so index the full arrays.\n",
    "        # Indexing the per-cluster slice with j (as this cell used to) picks the\n",
    "        # wrong sample and can run past the end of a small cluster.\n",
    "        plt.text(X_train_pca[j,0],0,str(int(y_train_values[j])),color=colors[i],\n",
    "            fontdict={'weight': 'bold', 'size': 9})\n",
    "\n",
    "plt.axis([-75,100,-1,1])\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "这里给出了 K=10 时的聚类结果。由于降维后仅剩一维，聚类相当于把这一维坐标轴划分成若干区间（类似一个分段函数）。然而从图中看，这些区间之间仍有重叠，类与类之间区分得不是很明显。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "开始测试"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 260,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count_1</th>\n",
       "      <th>count_2</th>\n",
       "      <th>count_3</th>\n",
       "      <th>count_4</th>\n",
       "      <th>count_5</th>\n",
       "      <th>count_6</th>\n",
       "      <th>count_7</th>\n",
       "      <th>count_8</th>\n",
       "      <th>count_9</th>\n",
       "      <th>count_10</th>\n",
       "      <th>...</th>\n",
       "      <th>count_92</th>\n",
       "      <th>count_93</th>\n",
       "      <th>count_94</th>\n",
       "      <th>count_95</th>\n",
       "      <th>count_96</th>\n",
       "      <th>count_97</th>\n",
       "      <th>count_98</th>\n",
       "      <th>count_99</th>\n",
       "      <th>count_100</th>\n",
       "      <th>count_101</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 101 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "  count_1 count_2 count_3 count_4 count_5 count_6 count_7 count_8 count_9  \\\n",
       "0       2       0       2       0       0       0       0       0       0   \n",
       "1       2       0       2       0       0       0       0       0       0   \n",
       "2       0       0       0       0       0       0       0       0       0   \n",
       "3       0       0       0       0       0       0       0       0       0   \n",
       "4       0       0       0       0       0       0       0       0       0   \n",
       "\n",
       "  count_10    ...    count_92 count_93 count_94 count_95 count_96 count_97  \\\n",
       "0        0    ...           0        1        0        0        0        0   \n",
       "1        0    ...           0        0        0        0        0        0   \n",
       "2        0    ...           0        0        0        0        0        0   \n",
       "3        0    ...           0        0        0        0        0        0   \n",
       "4        0    ...           0        0        0        0        0        0   \n",
       "\n",
       "  count_98 count_99 count_100 count_101  \n",
       "0        0        0         0         9  \n",
       "1        0        0         0         7  \n",
       "2        0        0         0        12  \n",
       "3        0        0         0        12  \n",
       "4        0        0         0        12  \n",
       "\n",
       "[5 rows x 101 columns]"
      ]
     },
     "execution_count": 260,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Join df (built in earlier cells) with the rows of test.csv on the shared\n",
    "# 'event' column. 'event' is read as a string, and the inner join keeps only\n",
    "# rows whose event appears on both sides.\n",
    "test = pd.merge(\n",
    "    left=df,\n",
    "    right=pd.read_csv(\"test.csv\", converters={\"event\": str}),\n",
    "    how=\"inner\",\n",
    "    on=\"event\",\n",
    ")\n",
    "\n",
    "# Drop the identifier/metadata columns; the remaining columns are the features.\n",
    "X_test = test.drop(columns=[\"user\", \"invited\", \"timestamp\", \"event\"])\n",
    "X_test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 261,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(10237, 101)\n"
     ]
    }
   ],
   "source": [
    "# Sanity check: (number of rows, number of feature columns) of the test matrix.\n",
    "print(X_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 262,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(10237, 1)\n"
     ]
    }
   ],
   "source": [
    "# Reduce the test features with the PCA that was fitted on the training data.\n",
    "# Fitting a second PCA on X_test (as this cell used to) yields a different\n",
    "# projection - possibly with a different number of components - so the result\n",
    "# would not live in the space the clusterer is trained on (X_train_pca).\n",
    "# NOTE(review): assumes `pca` is still the estimator fitted on the training\n",
    "# features in the earlier dimensionality-reduction cell - confirm the name.\n",
    "X_test_pca = pca.transform(X_test)\n",
    "\n",
    "# The clusterer is fitted on X_train_pca, so both must have the same width.\n",
    "assert X_test_pca.shape[1] == X_train_pca.shape[1], 'train/test PCA dimensionality differs'\n",
    "\n",
    "# Dimensionality after the projection.\n",
    "print(X_test_pca.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 263,
   "metadata": {},
   "outputs": [],
   "source": [
    "# K = 100 is the value the original author marked as the optimal K.\n",
    "# The fixed seed makes the cluster ids that are exported below repeatable;\n",
    "# without it every run can number and place the clusters differently.\n",
    "mb_kmeans = MiniBatchKMeans(n_clusters = 100, random_state = 0)\n",
    "mb_kmeans.fit(X_train_pca)\n",
    "# Assign every test row to a cluster of the model fitted on the training data.\n",
    "# test.csv has no label column, so these assignments cannot be scored.\n",
    "y_test = mb_kmeans.predict(X_test_pca)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 266,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the predicted cluster id of every test row. Rows follow the order of\n",
    "# the merged `test` frame; the file holds only the row position and the id.\n",
    "pd.DataFrame(data=y_test).to_csv(path_or_buf=\"test_result.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
